diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..6e3b986
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+*
+!.gitignore
+!crawl2pdf.py
+!crawl2pypdf.py
\ No newline at end of file
diff --git a/crawl2pdf.py b/crawl2pdf.py
new file mode 100644
index 0000000..7d4193a
--- /dev/null
+++ b/crawl2pdf.py
@@ -0,0 +1,85 @@
+import sys
+import weasyprint
+import re
+
+import logging
+
+#config
+override_css = ['override.css'] #user style; recommended to at least set the @page css rule to fit the site you are scraping
+output_pdf = 'output.pdf'
+debug = False
+
+#weasyprint <=52.2 might produce a broken pdf if there are custom fonts, apparently
+
+#crawl2pdf.py <scope> <url1> [url2] [url3]...
+#crawls a site and turns the included pages into a pdf according to the specified regex scope
+
+#init vars
+resolved_urls = []
+try:
+    scope = sys.argv[1] #scope in regex
+
+    urls = sys.argv[2:] #initial urls
+    if(len(urls) == 0):
+        raise IndexError()
+
+    for css in override_css: #check file existence
+        with open(css) as f:
+            continue
+except IOError:
+    print('invalid user css file(s) specified, ignoring...')
+    override_css = None
+except IndexError:
+    print('Please enter both the regex scope and the url to fetch!')
+    print('Syntax: crawl2pdf.py <scope> <url1> [url2] [url3]...')
+    quit()
+
+def merge_pdf(base_doc, new_doc):
+    if(new_doc != None):
+        base_doc.metadata.attachments += new_doc.metadata.attachments
+        base_doc.pages += new_doc.pages
+    return base_doc
+
+#shouldn't need a timeout to avoid rate limits - rendering already takes a lot of time; just do it synchronously
+def resolve_into_pdf(url):
+    print('resolving ' + url + '...')
+    try:
+        doc = weasyprint.HTML(url).render(stylesheets=override_css, optimize_images=True, presentational_hints=True)
+    except weasyprint.urls.URLFetchingError as e:
+        print('failed to fetch - ' + str(e))
+        return None
+
+    for link in sum([x[0] for x in weasyprint.document.resolve_links(doc.pages)], []): #take the link list from each page tuple, concatenate them, then iterate
+        if(link[0] == 'external'):
+            new_url = link[1].split('#')[0] #remove the fragment identifier as it points to the same page
+            if(new_url not in resolved_urls and re.match(scope, new_url)): #prevent infinite recursion; do not resolve urls that are out of scope
+                #if the server allows multiple urls for the same page this might result in duplicates (e.g. foo/index.html and foo/)
+                resolved_urls.append(new_url)
+                try:
+                    print('from ' + url, end=' ') #prepend additional info about the traversal to the output
+                    doc = merge_pdf(doc, resolve_into_pdf(new_url))
+                except AssertionError: #might be a non-html file we accidentally ran into; still, it's recommended to include some sort of check in the scope
+                    continue
+    return doc
+
+
+if debug:
+    handler = logging.StreamHandler(sys.stdout)
+    handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
+    weasyprint.LOGGER.setLevel(logging.DEBUG)
+    weasyprint.PROGRESS_LOGGER.setLevel(logging.DEBUG)
+    weasyprint.LOGGER.addHandler(handler)
+    weasyprint.PROGRESS_LOGGER.addHandler(handler)
+
+root_doc = resolve_into_pdf(urls[0]) #initial pdf to append pages onto
+
+#resolve additional urls if any
+for url in urls[1:]: #no need to check bounds - slicing does not raise but returns an empty list if out of range
+    root_doc = merge_pdf(root_doc, resolve_into_pdf(url))
+
+print('writing pdf...')
+root_doc.write_pdf(output_pdf)
+
+
+
+
diff --git a/crawl2pypdf.py b/crawl2pypdf.py
new file mode 100644
index 0000000..903bc77
--- /dev/null
+++ b/crawl2pypdf.py
@@ -0,0 +1,107 @@
+import sys
+import weasyprint
+import re
+import PyPDF2
+import io
+
+import logging
+
+#config
+override_css = ['override.css'] #user style; recommended to at least set the @page css rule to fit the site you are scraping
+output_pdf = 'output.pdf'
+debug = False
+
+#weasyprint <=52.2 might produce a broken pdf if there are custom fonts, apparently
+
+#weasyprint merging via doc.pages doesn't always work well - some text gets broken, so this version uses pypdf2 for merging;
+#even then, pypdf2 sometimes needs patches to read the non-standard pdfs generated by weasyprint:
+#utils.py - in b_(s): `r = s.encode('latin-1')` -> `r = s.encode('UTF-8')`
+#generic.py - in getOutlines(self, node=None, outlines=None): wrap `node = node["/Next"]` in a try/except and break on ValueError
+
+#crawl2pypdf.py <scope> <url1> [url2] [url3]...
+#crawls a site and turns the included pages into a pdf according to the specified regex scope, using pypdf2 merging
+
+#init vars
+resolved_urls = []
+try:
+    scope = sys.argv[1] #scope in regex
+
+    urls = sys.argv[2:] #initial urls
+    if(len(urls) == 0):
+        raise IndexError()
+
+    for css in override_css: #check file existence
+        with open(css) as f:
+            continue
+except IOError:
+    print('invalid user css file(s) specified, ignoring...')
+    override_css = None
+except IndexError:
+    print('Please enter both the regex scope and the url to fetch!')
+    print('Syntax: crawl2pypdf.py <scope> <url1> [url2] [url3]...')
+    quit()
+
+#merge two streams at a time to maintain order and to avoid problems with too many pdfs appended to a single merger
+def merge_pdf(base_pdf_stream, new_pdf_stream):
+    if(new_pdf_stream == None):
+        return base_pdf_stream
+
+    merger = PyPDF2.PdfFileMerger(strict=False) #needed or else some pdfs won't be read
+
+    merger.append(base_pdf_stream)
+    merger.append(new_pdf_stream)
+
+    merge_stream = io.BytesIO()
+    merger.write(merge_stream)
+
+    base_pdf_stream.close()
+    new_pdf_stream.close()
+    merger.close()
+
+    return merge_stream
+
+#shouldn't need a timeout to avoid rate limits - rendering already takes a lot of time; just do it synchronously
+def resolve_into_pdf(url):
+    print('resolving ' + url + '...')
+    try:
+        doc = weasyprint.HTML(url).render(stylesheets=override_css, optimize_images=True, presentational_hints=True)
+        stream = io.BytesIO(doc.write_pdf())
+    except weasyprint.urls.URLFetchingError as e:
+        print('failed to fetch - ' + str(e))
+        return None
+
+    for link in sum([x[0] for x in weasyprint.document.resolve_links(doc.pages)], []): #take the link list from each page tuple, concatenate them, then iterate
+        if(link[0] == 'external'):
+            new_url = link[1].split('#')[0] #remove the fragment identifier as it points to the same page
+            if(new_url not in resolved_urls and re.match(scope, new_url)): #prevent infinite recursion; do not resolve urls that are out of scope
+                #if the server allows multiple urls for the same page this might result in duplicates (e.g. foo/index.html and foo/)
+                resolved_urls.append(new_url)
+                try:
+                    print('from ' + url, end=' ') #prepend additional info about the traversal to the output
+                    stream = merge_pdf(stream, resolve_into_pdf(new_url))
+                except AssertionError: #might be a non-html file we accidentally ran into; still, it's recommended to include some sort of check in the scope
+                    continue
+    return stream
+
+
+if debug:
+    handler = logging.StreamHandler(sys.stdout)
+    handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
+    weasyprint.LOGGER.setLevel(logging.DEBUG)
+    weasyprint.PROGRESS_LOGGER.setLevel(logging.DEBUG)
+    weasyprint.LOGGER.addHandler(handler)
+    weasyprint.PROGRESS_LOGGER.addHandler(handler)
+
+root_stream = resolve_into_pdf(urls[0]) #initial pdf to append pages onto
+
+#resolve additional urls if any
+for url in urls[1:]: #no need to check bounds - slicing does not raise but returns an empty list if out of range
+    root_stream = merge_pdf(root_stream, resolve_into_pdf(url))
+
+print('writing pdf...')
+with open(output_pdf, "wb") as f:
+    f.write(root_stream.getbuffer())
+
+
+
+
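Both scripts take the scope as their first argument and test every discovered link with re.match, so the pattern is effectively anchored at the start of the URL. A minimal sketch of that check in isolation; the scope and candidate URLs below are placeholders, not taken from the scripts:

import re

#hypothetical scope and candidate links - example.com is a placeholder
scope = r'https://docs\.example\.com/en/.*'

candidates = [
    'https://docs.example.com/en/install.html',  #matches the scope - would be rendered and merged
    'https://docs.example.com/fr/install.html',  #outside the scope - skipped
    'https://blog.example.com/post',             #different host - skipped
]

for url in candidates:
    print(url, '->', 'crawl' if re.match(scope, url) else 'skip')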
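The override_css comment in both scripts recommends at least setting an @page rule so the rendered pages fit the target site. A minimal sketch of generating such an override.css from Python; the page size and margin are assumptions to adjust per site, not values from the repo:

#writes a placeholder override.css next to the scripts; tweak the values for the site being crawled
page_css = (
    "@page {\n"
    "    size: A4;      /* or an explicit size such as '1280px 1800px' for wide layouts */\n"
    "    margin: 10mm;\n"
    "}\n"
)

with open('override.css', 'w') as f:
    f.write(page_css)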