+ for link in sum([x[0] for x in weasyprint.document.resolve_links(doc.pages)], []):  # resolve_links yields a (links, anchors) tuple per page; flatten every page's link list into one and iterate
+     if link[0] == 'external':
+         new_url = link[1].split('#')[0]  # strip the fragment identifier, since it points into the same page
+         if new_url not in resolved_urls and re.match(scope, new_url):  # prevent infinite recursion; skip URLs outside the scope
+             # if the server allows multiple URLs for the same page, this may still produce duplicates (e.g. foo/index.html and foo/)
+             resolved_urls.append(new_url)
+             try:
+                 print('from ' + url, end=' ')  # prepend traversal info to the output
+                 doc = merge_pdf(doc, resolve_into_pdf(new_url))
+             except AssertionError:  # we might accidentally run into non-HTML files; it is still recommended to include some kind of check in the scope pattern
+                 pass  # skip resources that fail to render
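
The loop calls two helpers, merge_pdf and resolve_into_pdf, that are not part of this hunk. A minimal sketch of what they might look like, assuming WeasyPrint's documented Document.copy and default_url_fetcher APIs; the names and details here are illustrative, not the patch's actual implementation:

import weasyprint

def merge_pdf(doc, other):
    # Sketch: Document.copy accepts a page list, so concatenating both
    # documents' pages yields a single merged document.
    return doc.copy(list(doc.pages) + list(other.pages))

def resolve_into_pdf(url):
    # Sketch: fetch the URL and assert it is HTML (the AssertionError caught
    # above suggests a check of this kind), then render it to a Document.
    result = weasyprint.default_url_fetcher(url)
    assert 'html' in result.get('mime_type', ''), 'not an HTML document'
    print('resolving ' + url)
    doc = weasyprint.HTML(url).render()
    # ... the link-resolution loop above would go here, merging each
    # in-scope link's document into doc before returning it ...
    return doc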