# -*- coding: utf-8 -*-

import os
import sys
import logging

from PyPDF2 import PdfFileReader, PdfFileWriter

from .core import TableList
from .parsers import Stream, Lattice
from .utils import (
    build_file_path_in_temp_dir,
    get_page_layout,
    get_text_objects,
    get_rotation,
    is_url,
    download_url,
)

logger = logging.getLogger("camelot")

# Maps the public ``flavor`` name to the parser class that implements it.
PARSERS = {
    "lattice": Lattice,
    "stream": Stream
}


class PDFHandler(object):
    """Splits a PDF into single-page PDFs and runs a table parser on each.

    The handler resolves the requested page numbers up front, writes each
    requested page to its own file (path obtained from
    ``build_file_path_in_temp_dir``), normalizes page rotation and then
    hands every page to the selected parser.

    Parameters
    ----------
    filepath : str
        Filepath or URL of the PDF file.
    pages : str, optional (default: '1')
        Comma-separated page numbers.
        Example: '1,3,4' or '1,4-end' or 'all'.
    password : str, optional (default: None)
        Password for decryption.

    Raises
    ------
    NotImplementedError
        If the (possibly downloaded) file path does not end in ``.pdf``.

    """

    def __init__(self, filepath, pages="1", password=None):
        if is_url(filepath):
            filepath = download_url(filepath)
        self.filepath = filepath
        if not filepath.lower().endswith(".pdf"):
            raise NotImplementedError("File format not supported")

        if password is None:
            self.password = ""
        else:
            self.password = password
            # On Python 2 the password is handed to PyPDF2 as a byte string.
            if sys.version_info[0] < 3:
                self.password = self.password.encode("ascii")
        self.pages = self._get_pages(self.filepath, pages)

    def _get_pages(self, filepath, pages):
        """Converts pages string to list of ints.

        Parameters
        ----------
        filepath : str
            Filepath or URL of the PDF file.
        pages : str, optional (default: '1')
            Comma-separated page numbers.
            Example: '1,3,4' or '1,4-end' or 'all'.

        Returns
        -------
        P : list
            Sorted list of unique int page numbers.

        """
        page_numbers = []

        if pages == "1":
            # Fast path: the default selection never needs to open the file.
            page_numbers.append({"start": 1, "end": 1})
        else:
            # Keep the file open for as long as the reader is used and make
            # sure the handle is released afterwards.
            with open(filepath, "rb") as fileobj:
                infile = PdfFileReader(fileobj, strict=False)
                if infile.isEncrypted:
                    infile.decrypt(self.password)
                if pages == "all":
                    page_numbers.append(
                        {"start": 1, "end": infile.getNumPages()})
                else:
                    for r in pages.split(","):
                        if "-" in r:
                            a, b = r.split("-")
                            # int() already ignores surrounding whitespace,
                            # so treat the 'end' keyword the same way.
                            if b.strip() == "end":
                                b = infile.getNumPages()
                            page_numbers.append(
                                {"start": int(a), "end": int(b)})
                        else:
                            page_numbers.append(
                                {"start": int(r), "end": int(r)})

        P = []
        for p in page_numbers:
            P.extend(range(p["start"], p["end"] + 1))
        return sorted(set(P))

    def _read_pdf_page(self, page=1, layout_kwargs=None):
        """Saves specified page from PDF into a temporary directory.

        Writes the page to its own single-page PDF (decrypting the source
        with the stored password when it is encrypted) and, if the text on
        the page is detected as rotated, rewrites that file with the
        rotation undone.

        Parameters
        ----------
        page : int
            Page number (1-based).
        layout_kwargs : dict, optional (default: {})
            A dict of ``pdfminer.layout.LAParams`` kwargs.

        Returns
        -------
        layout : object
            Page layout as returned by ``get_page_layout``.
        dimensions : tuple
            The dimensions of the pdf page.
        filepath : str
            The path of the single page PDF that was written.

        """
        layout_kwargs = layout_kwargs or {}
        with open(self.filepath, "rb") as fileobj:
            # Extract the requested page into its own single-page PDF.
            infile = PdfFileReader(fileobj, strict=False)
            if infile.isEncrypted:
                infile.decrypt(self.password)
            fpath = build_file_path_in_temp_dir(
                "page-{page}.pdf".format(page=page))
            froot, fext = os.path.splitext(fpath)
            p = infile.getPage(page - 1)
            outfile = PdfFileWriter()
            outfile.addPage(p)
            with open(fpath, "wb") as f:
                outfile.write(f)
            layout, dimensions = get_page_layout(
                fpath, **layout_kwargs)

            # fix rotated PDF
            chars = get_text_objects(layout, ltype="char")
            horizontal_text = get_text_objects(layout, ltype="horizontal_text")
            vertical_text = get_text_objects(layout, ltype="vertical_text")
            rotation = get_rotation(chars, horizontal_text, vertical_text)
            if rotation != "":
                # Only rewrite the file name: a plain str.replace on the
                # whole path would also mangle directory names that happen
                # to contain "page" and make the rename fail.
                fdir, fname = os.path.split(froot)
                fpath_new = os.path.join(
                    fdir,
                    "".join([fname.replace("page", "p"), "_rotated", fext]))
                os.rename(fpath, fpath_new)
                # The reader pulls data from the stream while the writer
                # runs, so the write has to happen before the handle closes.
                with open(fpath_new, "rb") as rotated_fileobj:
                    infile = PdfFileReader(rotated_fileobj, strict=False)
                    if infile.isEncrypted:
                        infile.decrypt(self.password)
                    outfile = PdfFileWriter()
                    p = infile.getPage(0)
                    # Rotate in the opposite direction to undo the rotation.
                    if rotation == "anticlockwise":
                        p.rotateClockwise(90)
                    elif rotation == "clockwise":
                        p.rotateCounterClockwise(90)
                    outfile.addPage(p)
                    with open(fpath, "wb") as f:
                        outfile.write(f)
                layout, dimensions = get_page_layout(
                    fpath, **layout_kwargs)
        return layout, dimensions, fpath

    def parse(
        self,
        flavor="lattice",
        suppress_stdout=False,
        layout_kwargs=None,
        **kwargs
    ):
        """Extracts tables by calling parser.extract_tables on all single
        page PDFs.

        Parameters
        ----------
        flavor : str (default: 'lattice')
            The parsing method to use ('lattice' or 'stream').
            Lattice is used by default.
        suppress_stdout : bool (default: False)
            When True, the per-page "Processing ..." log message is not
            emitted.
        layout_kwargs : dict, optional (default: {})
            A dict of ``pdfminer.layout.LAParams`` kwargs.
        kwargs : dict
            See camelot.read_pdf kwargs.

        Returns
        -------
        tables : camelot.core.TableList
            List of tables found in PDF.

        Raises
        ------
        KeyError
            If ``flavor`` is not a key of ``PARSERS``.

        """
        layout_kwargs = layout_kwargs or {}
        tables = []

        parser_obj = PARSERS[flavor]
        parser = parser_obj(**kwargs)

        # Read the layouts/dimensions of each of the pages we need to
        # parse. This might require creating a temporary .pdf.
        for page_idx in self.pages:
            layout, dimensions, source_file = self._read_pdf_page(
                page_idx,
                layout_kwargs=layout_kwargs
            )
            parser._generate_layout(source_file, layout, dimensions,
                                    page_idx, layout_kwargs)
            rootname = os.path.basename(parser.rootname)
            if not suppress_stdout:
                logger.info(
                    "Processing {rootname}".format(rootname=rootname))
            t = parser.extract_tables(
                source_file
            )
            tables.extend(t)
        return TableList(sorted(tables))