crawl2pypdf.py

import sys
import weasyprint
import re
import PyPDF2
import io
import logging
#config
override_css = ['override.css'] #user styles; recommended to at least set the @page CSS rule to better fit the site you are scraping
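#for reference, a minimal override.css could look like this (values are placeholders, not tuned for any particular site):
#  @page { size: A4; margin: 12mm; }
#  body { font-family: sans-serif; }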
output_pdf = 'output.pdf'
debug = False
#weasyprint <=52.2 can apparently produce broken pdfs when custom fonts are involved
#merging via weasyprint's doc.pages doesn't always work well - some text gets broken, so this version uses pypdf2 instead;
#even then, pypdf2 sometimes needs patches to read the non-standard pdfs that weasyprint generates:
#utils.py - in b_(s): `r = s.encode('latin-1')` -> `r = s.encode('UTF-8')`
#generic.py - in getOutlines(self, node=None, outlines=None): wrap `node = node["/Next"]` in a try/except and break on ValueError
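#the generic.py patch sketched out (approximate - the exact surrounding context differs between pypdf2 releases):
#    try:
#        node = node["/Next"]
#    except ValueError:
#        break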
#usage: crawl2pypdf.py <scope> <url1> [url2] [url3]...
#crawls a site and turns every page matching the regex scope into a single pdf via pypdf2 merging
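#example invocation (hypothetical site and urls):
#  python crawl2pypdf.py 'https://example\.com/docs/.*' https://example.com/docs/index.html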
#init vars
resolved_urls = []
try:
    scope = sys.argv[1] #scope in regex
    urls = sys.argv[2:] #initial urls
    if len(urls) == 0:
        raise IndexError()
    for css in override_css: #check file existence
        with open(css) as f:
            continue
except IOError:
    print('invalid user css file(s) specified, ignoring...')
    override_css = None
except IndexError:
    print('Please enter both the regex scope and the url to fetch!')
    print('Syntax: crawl2pypdf.py <scope> <url1> [url2] [url3]...')
    quit()
#recursive merging to maintain order and to avoid issues with too many pdfs merged in a single merger
def merge_pdf(base_pdf_stream, new_pdf_stream):
    if new_pdf_stream is None:
        return base_pdf_stream
    if base_pdf_stream is None: #the base fetch itself may have failed
        return new_pdf_stream
    merger = PyPDF2.PdfFileMerger(strict=False) #needed or else some pdfs won't be read
    merger.append(base_pdf_stream)
    merger.append(new_pdf_stream)
    merge_stream = io.BytesIO()
    merger.write(merge_stream)
    base_pdf_stream.close()
    new_pdf_stream.close()
    merger.close()
    return merge_stream
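#usage sketch: each call merges exactly two pdfs, so deep crawls become a chain of small pairwise merges, e.g.
#  combined = merge_pdf(io.BytesIO(pdf_a_bytes), io.BytesIO(pdf_b_bytes)) #pdf_a_bytes/pdf_b_bytes are placeholder byte strings
#the returned BytesIO holds pdf_a's pages followed by pdf_b's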
#shouldn't need a delay to dodge rate limits - rendering already takes plenty of time; just crawl synchronously
def resolve_into_pdf(url):
    print('resolving ' + url + '...')
    try:
        doc = weasyprint.HTML(url).render(stylesheets=override_css, optimize_images=True, presentational_hints=True)
        stream = io.BytesIO(doc.write_pdf())
    except weasyprint.urls.URLFetchingError as e:
        print('failed to fetch - ' + str(e))
        return None
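    #note: in the weasyprint versions this targets, resolve_links() yields a (links, anchors) pair per page,
    #and each link is roughly (link_type, target, rectangle) - hence link[0]/link[1] below;
    #the exact tuple shape may differ in newer releases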
    for link in sum([x[0] for x in weasyprint.document.resolve_links(doc.pages)], []): #flatten the per-page lists of link tuples into one list, then iterate
        if link[0] == 'external':
            new_url = link[1].split('#')[0] #strip the fragment identifier since it points to the same page
            if new_url not in resolved_urls and re.match(scope, new_url): #prevent infinite recursion; skip urls outside the scope
                #if the server exposes the same page under multiple urls this may produce duplicates (e.g. foo/index.html and foo/)
                resolved_urls.append(new_url)
                try:
                    print('from ' + url, end=' ') #prepend traversal info to the output
                    stream = merge_pdf(stream, resolve_into_pdf(new_url))
                except AssertionError: #might be a non-html file we stumbled into; it's still recommended to build such checks into the scope (see the example pattern after this function)
                    continue
    return stream
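#a scope that already filters out some common non-html extensions could look like this (hypothetical pattern):
#  r'https://example\.com/docs/.*(?<!\.png)(?<!\.css)(?<!\.zip)$'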
if debug:
    handler = logging.StreamHandler(sys.stdout)
    handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
    weasyprint.LOGGER.setLevel(logging.DEBUG)
    weasyprint.PROGRESS_LOGGER.setLevel(logging.DEBUG)
    weasyprint.LOGGER.addHandler(handler)
    weasyprint.PROGRESS_LOGGER.addHandler(handler)
root_stream = resolve_into_pdf(urls[0]) #initial pdf to append pages to
#resolve additional urls if any
for url in urls[1:]: #no bounds check needed - slicing past the end returns an empty list instead of raising
    root_stream = merge_pdf(root_stream, resolve_into_pdf(url))
print('writing pdf...')
if root_stream is None: #every fetch failed, nothing to write
    print('no pages could be fetched, aborting...')
    quit()
with open(output_pdf, "wb") as f:
    f.write(root_stream.getbuffer())
