import logging
import re
import sys

import weasyprint

#config
override_css = ['override.css']  #user stylesheet; at minimum set an @page rule sized to fit the site you are scraping
output_pdf = 'output.pdf'
debug = False
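#a minimal override.css sketch (illustrative only - tune the page size and hidden elements to the target site):
#  @page { size: A4; margin: 15mm; }
#  nav, footer { display: none; }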

#note: weasyprint <= 52.2 reportedly produces broken pdfs when pages use custom fonts

#usage: crawl2pdf.py <scope> <url1> [url2] [url3]...
#crawls a site and turns the pages included by the specified regex scope into a single pdf
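#example invocation (hypothetical scope and url, for illustration only):
#  python crawl2pdf.py 'https://example\.com/docs/.*' https://example.com/docs/index.html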

#init vars
resolved_urls = []
try:
    scope = sys.argv[1]  #scope in regex

    urls = sys.argv[2:]  #initial urls
    if not urls:
        raise IndexError()

    for css in override_css:  #verify every user css file exists and is readable
        with open(css):
            pass
except IOError:
    print('invalid user css file(s) specified, ignoring...')
    override_css = None
except IndexError:
    print('Please enter both the regex scope and at least one url to fetch!')
    print('Syntax: crawl2pdf.py <scope> <url1> [url2] [url3]...')
    quit()

def merge_pdf(base_doc, new_doc):
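    #append new_doc's pages and attachments onto base_doc; tolerates None for either document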
    if base_doc is None:  #can happen if the very first url failed to resolve
        return new_doc
    if new_doc is not None:
        base_doc.metadata.attachments += new_doc.metadata.attachments
        base_doc.pages += new_doc.pages
    return base_doc

#no artificial delay is needed to avoid rate limits - rendering each page already takes plenty of time, so just crawl synchronously
def resolve_into_pdf(url):
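    #render the page at url, then recursively render and merge every in-scope link found on it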
    print('resolving ' + url + '...')
    try:
        doc = weasyprint.HTML(url).render(stylesheets=override_css, optimize_images=True, presentational_hints=True)
    except weasyprint.urls.URLFetchingError as e:
        print('failed to fetch - ' + str(e))
        return None
    
    for link in sum([x[0] for x in weasyprint.document.resolve_links(doc.pages)], []):  #resolve_links yields a tuple per page whose first element is that page's link list; flatten across pages and iterate
        if(link[0] == 'external'):
            new_url = link[1].split('#')[0] #remove fragment identifier as it will be the same page
            if(new_url not in resolved_urls and re.match(scope, new_url)): #prevent infinite recursion; do not resolve if external (not in scope)
                #if the server exposes the same page under multiple urls (e.g. foo/index.html and foo/), duplicates may still slip through
                resolved_urls.append(new_url)
                try:
                    print('from ' + url, end=' ')  #prefix the next 'resolving ...' line with the referring page
                    doc = merge_pdf(doc, resolve_into_pdf(new_url))
                except AssertionError:  #probably a non-html resource we stumbled into; it is still recommended to exclude such urls via the scope regex
                    continue
    return doc


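#when debugging, route weasyprint's internal loggers to stdout with timestamped output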
if debug:
    handler = logging.StreamHandler(sys.stdout)
    handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
    weasyprint.LOGGER.setLevel(logging.DEBUG)
    weasyprint.PROGRESS_LOGGER.setLevel(logging.DEBUG)
    weasyprint.LOGGER.addHandler(handler)
    weasyprint.PROGRESS_LOGGER.addHandler(handler)

root_doc = resolve_into_pdf(urls[0])  #initial pdf that every other page gets merged into

#resolve additional urls if any
for url in urls[1:]:  #no bounds check needed - slicing past the end of the list just returns an empty list
    root_doc = merge_pdf(root_doc, resolve_into_pdf(url))

print('writing pdf...')
if root_doc is not None:  #root_doc is None only if every url failed to resolve
    root_doc.write_pdf(output_pdf)
else:
    print('no pages could be resolved, nothing to write')
