diff --git a/camelot/core.py b/camelot/core.py index 0934961..91cb1bd 100644 --- a/camelot/core.py +++ b/camelot/core.py @@ -1,69 +1,14 @@ +import os +import tempfile + import numpy as np import pandas as pd +from PyPDF2 import PdfFileReader, PdfFileWriter + +from .utils import get_page_layout, get_text_objects, get_rotation class Cell(object): - """Cell. - Defines a cell object with coordinates relative to a left-bottom - origin, which is also PDFMiner's coordinate space. - - Parameters - ---------- - x1 : float - x-coordinate of left-bottom point. - - y1 : float - y-coordinate of left-bottom point. - - x2 : float - x-coordinate of right-top point. - - y2 : float - y-coordinate of right-top point. - - Attributes - ---------- - lb : tuple - Tuple representing left-bottom coordinates. - - lt : tuple - Tuple representing left-top coordinates. - - rb : tuple - Tuple representing right-bottom coordinates. - - rt : tuple - Tuple representing right-top coordinates. - - bbox : tuple - Tuple representing the cell's bounding box using the - lower-bottom and right-top coordinates. - - left : bool - Whether or not cell is bounded on the left. - - right : bool - Whether or not cell is bounded on the right. - - top : bool - Whether or not cell is bounded on the top. - - bottom : bool - Whether or not cell is bounded on the bottom. - - text_objects : list - List of text objects assigned to cell. - - text : string - Text assigned to cell. - - spanning_h : bool - Whether or not cell spans/extends horizontally. - - spanning_v : bool - Whether or not cell spans/extends vertically. - """ - def __init__(self, x1, y1, x2, y2): self.x1 = x1 @@ -86,76 +31,23 @@ class Cell(object): self.image = None def add_text(self, text): - """Adds text to cell. - - Parameters - ---------- - text : string - """ self.text = ''.join([self.text, text]) def get_text(self): - """Returns text assigned to cell. - - Returns - ------- - text : string - """ return self.text def add_object(self, t_object): - """Adds PDFMiner text object to cell. - - Parameters - ---------- - t_object : object - """ self.text_objects.append(t_object) def get_objects(self): - """Returns list of text objects assigned to cell. - - Returns - ------- - text_objects : list - """ return self.text_objects def get_bounded_edges(self): - """Returns the number of edges by which a cell is bounded. - - Returns - ------- - bounded_edges : int - """ self.bounded_edges = self.top + self.bottom + self.left + self.right return self.bounded_edges class Table(object): - """Table. - Defines a table object with coordinates relative to a left-bottom - origin, which is also PDFMiner's coordinate space. - - Parameters - ---------- - cols : list - List of tuples representing column x-coordinates in increasing - order. - - rows : list - List of tuples representing row y-coordinates in decreasing - order. - - Attributes - ---------- - cells : list - List of cell objects with row-major ordering. - - nocont_ : int - Number of lines that did not contribute to setting cell edges. - """ - def __init__(self, cols, rows): self.cols = cols @@ -166,8 +58,6 @@ class Table(object): self.image = None def set_all_edges(self): - """Sets all table edges to True. - """ for r in range(len(self.rows)): for c in range(len(self.cols)): self.cells[r][c].left = True @@ -177,8 +67,6 @@ class Table(object): return self def set_border_edges(self): - """Sets table border edges to True. - """ for r in range(len(self.rows)): self.cells[r][0].left = True self.cells[r][len(self.cols) - 1].right = True @@ -188,19 +76,6 @@ class Table(object): return self def set_edges(self, vertical, horizontal, jtol=2): - """Sets a cell's edges to True depending on whether they - overlap with lines found by imgproc. - - Parameters - ---------- - vertical : list - List of vertical lines detected by imgproc. Coordinates - scaled and translated to the PDFMiner's coordinate space. - - horizontal : list - List of horizontal lines detected by imgproc. Coordinates - scaled and translated to the PDFMiner's coordinate space. - """ for v in vertical: # find closest x coord # iterate over y coords and find closest points @@ -308,10 +183,6 @@ class Table(object): return self def set_spanning(self): - """Sets a cell's spanning_h or spanning_v attribute to True - depending on whether the cell spans/extends horizontally or - vertically. - """ for r in range(len(self.rows)): for c in range(len(self.cols)): bound = self.cells[r][c].get_bounded_edges() @@ -351,13 +222,6 @@ class Table(object): return self def get_list(self): - """Returns a two-dimensional list of text assigned to each - cell. - - Returns - ------- - ar : list - """ ar = [] for r in range(len(self.rows)): ar.append([self.cells[r][c].get_text().strip() @@ -366,4 +230,76 @@ class Table(object): class TableSet(object): - pass \ No newline at end of file + pass + + +class FileHandler(object): + def __init__(self, filename, pages='1'): + self.filename = filename + if not self.filename.endswith('.pdf'): + raise TypeError("File format not supported.") + self.pages = __get_pages(pages) + self.temp = tempfile.mkdtemp() + + @staticmethod + def __get_pages(filename, pages): + p = {} + if pages == '1': + p.append({'start': 1, 'end': 1}) + else: + infile = PdfFileReader(open(filename, 'rb'), strict=False) + if pages == 'all': + p.append({'start': 1, 'end': infile.getNumPages()}) + else: + for r in pages.split(','): + if '-' in r: + a, b = r.split('-') + if b == 'end': + b = infile.getNumPages() + p.append({'start': int(a), 'end': int(b)}) + else: + p.append({'start': int(r), 'end': int(r)}) + return p + + @staticmethod + def __save_page(filename, page, temp): + with open(filename, 'rb') as fileobj: + infile = PdfFileReader(fileobj, strict=False) + fpath = os.path.join(temp, 'page-{0}.pdf'.format(page)) + fname, fext = os.path.splitext(fpath) + p = infile.getPage(page - 1) + outfile = PdfFileWriter() + outfile.addPage(p) + with open(fpath, 'wb') as f: + outfile.write(f) + layout, dim = get_page_layout(fpath) + # fix rotated pdf + lttextlh = get_text_objects(layout, ltype="lh") + lttextlv = get_text_objects(layout, ltype="lv") + ltchar = get_text_objects(layout, ltype="char") + rotation = get_rotation(lttextlh, lttextlv, ltchar) + if rotation != '': + fpath_new = ''.join([fname.replace('page', 'p'), '_rotated', fext]) + os.rename(fpath, fpath_new) + infile = PdfFileReader(open(fpath_new, 'rb'), strict=False) + outfile = PdfFileWriter() + p = infile.getPage(0) + if rotation == 'left': + p.rotateClockwise(90) + elif rotation == 'right': + p.rotateCounterClockwise(90) + outfile.addPage(p) + with open(fpath, 'wb') as f: + outfile.write(f) + + def parse(self): + for p in self.pages: + __save_page(self.filename, p, self.temp) + pages = [os.path.join(self.temp, 'page-{0}.pdf'.format(p)) + for p in self.pagenos] + tables = {} + for p in pages: + table = self.parser.get_tables(p) + if table is not None: + tables.update(table) + return tables \ No newline at end of file diff --git a/camelot/pdf.py b/camelot/pdf.py deleted file mode 100644 index 08fd26c..0000000 --- a/camelot/pdf.py +++ /dev/null @@ -1,268 +0,0 @@ -import os -import shutil -import tempfile -import itertools -import multiprocessing as mp -from functools import partial - -import cv2 -from PyPDF2 import PdfFileReader, PdfFileWriter - -from .utils import get_page_layout, get_text_objects, get_rotation - - -__all__ = ['Pdf'] - - -def _parse_page_numbers(pagenos): - """Converts list of dicts to list of ints. - - Parameters - ---------- - pagenos : list - List of dicts representing page ranges. A dict must have only - two keys named 'start' and 'end' having int as their value. - - Returns - ------- - page_numbers : list - List of int page numbers. - """ - page_numbers = [] - for p in pagenos: - page_numbers.extend(range(p['start'], p['end'] + 1)) - page_numbers = sorted(set(page_numbers)) - return page_numbers - - -def _save_page(temp, pdfname, pageno): - with open(pdfname, 'rb') as pdffile: - infile = PdfFileReader(pdffile, strict=False) - sp_path = os.path.join(temp, 'page-{0}.pdf'.format(pageno)) - sp_name, sp_ext = os.path.splitext(sp_path) - page = infile.getPage(pageno - 1) - outfile = PdfFileWriter() - outfile.addPage(page) - with open(sp_path, 'wb') as f: - outfile.write(f) - layout, dim = get_page_layout(sp_path) - lttextlh = get_text_objects(layout, ltype="lh") - lttextlv = get_text_objects(layout, ltype="lv") - ltchar = get_text_objects(layout, ltype="char") - rotation = get_rotation(lttextlh, lttextlv, ltchar) - if rotation != '': - sp_new_path = ''.join([sp_name.replace('page', 'p'), '_rotated', sp_ext]) - os.rename(sp_path, sp_new_path) - sp_in = PdfFileReader(open(sp_new_path, 'rb'), - strict=False) - sp_out = PdfFileWriter() - sp_page = sp_in.getPage(0) - if rotation == 'left': - sp_page.rotateClockwise(90) - elif rotation == 'right': - sp_page.rotateCounterClockwise(90) - sp_out.addPage(sp_page) - with open(sp_path, 'wb') as pdf_out: - sp_out.write(pdf_out) - - -class Pdf: - """Pdf manager. - Handles all operations like temp directory creation, splitting file - into single page pdfs, running extraction using multiple processes - and removing the temp directory. - - Parameters - ---------- - extractor : object - camelot.stream.Stream or camelot.lattice.Lattice extractor - object. - - pdfname : string - Path to pdf file. - - pagenos : list - List of dicts representing page ranges. A dict must have only - two keys named 'start' and 'end' having int as their value. - (optional, default: [{'start': 1, 'end': 1}]) - - parallel : bool - Whether or not to run using multiple processes. - (optional, default: False) - - clean : bool - Whether or not to remove the temp directory. - (optional, default: False) - """ - - def __init__(self, extractor, pdfname, pagenos=[{'start': 1, 'end': 1}], - parallel=False, clean=False): - - self.extractor = extractor - self.pdfname = pdfname - if not self.pdfname.endswith('.pdf'): - raise TypeError("File format not supported.") - self.pagenos = _parse_page_numbers(pagenos) - self.parallel = parallel - if self.parallel: - self.cpu_count = mp.cpu_count() - self.pool = mp.Pool(processes=self.cpu_count) - self.clean = clean - self.temp = tempfile.mkdtemp() - - def split(self): - """Splits file into single page pdfs. - """ - if self.parallel: - pfunc = partial(_save_page, self.temp, self.pdfname) - self.pool.map(pfunc, self.pagenos) - else: - for p in self.pagenos: - _save_page(self.temp, self.pdfname, p) - - - def extract(self): - """Runs table extraction by calling extractor.get_tables - on all single page pdfs. - """ - self.split() - pages = [os.path.join(self.temp, 'page-{0}.pdf'.format(p)) - for p in self.pagenos] - if self.parallel: - tables = self.pool.map(self.extractor.get_tables, pages) - tables = {k: v for d in tables if d is not None for k, v in d.items()} - else: - tables = {} - if self.extractor.debug: - if self.extractor.method == 'stream': - self.debug = self.extractor.debug - self.debug_text = [] - elif self.extractor.method in ['lattice', 'ocrl']: - self.debug = self.extractor.debug - self.debug_images = [] - self.debug_segments = [] - self.debug_tables = [] - elif self.extractor.method == 'ocrs': - self.debug = self.extractor.debug - self.debug_images = [] - for p in pages: - table = self.extractor.get_tables(p) - if table is not None: - tables.update(table) - if self.extractor.debug: - if self.extractor.method == 'stream': - self.debug_text.append(self.extractor.debug_text) - elif self.extractor.method in ['lattice', 'ocr']: - self.debug_images.append(self.extractor.debug_images) - self.debug_segments.append(self.extractor.debug_segments) - self.debug_tables.append(self.extractor.debug_tables) - elif self.extractor.method == 'ocrs': - self.debug_images.append(self.extractor.debug_images) - if self.clean: - self.remove_tempdir() - return tables - - def remove_tempdir(self): - """Removes temporary directory that was created to save single - page pdfs and their images. - """ - shutil.rmtree(self.temp) - - def debug_plot(self): - """Generates a matplotlib plot based on the selected extractor - debug option. - """ - import matplotlib.pyplot as plt - import matplotlib.patches as patches - - if self.debug is True: - if hasattr(self, 'debug_text'): - for text in self.debug_text: - fig = plt.figure() - ax = fig.add_subplot(111, aspect='equal') - xs, ys = [], [] - for t in text: - xs.extend([t[0], t[1]]) - ys.extend([t[2], t[3]]) - ax.add_patch( - patches.Rectangle( - (t[0], t[1]), - t[2] - t[0], - t[3] - t[1] - ) - ) - ax.set_xlim(min(xs) - 10, max(xs) + 10) - ax.set_ylim(min(ys) - 10, max(ys) + 10) - plt.show() - elif hasattr(self, 'debug_images'): - for img in self.debug_images: - plt.imshow(img) - plt.show() - elif self.debug == 'contour': - try: - for img, table_bbox in self.debug_images: - for t in table_bbox.keys(): - cv2.rectangle(img, (t[0], t[1]), - (t[2], t[3]), (255, 0, 0), 3) - plt.imshow(img) - plt.show() - except AttributeError: - raise ValueError("This option can only be used with Lattice.") - elif self.debug == 'joint': - try: - for img, table_bbox in self.debug_images: - x_coord = [] - y_coord = [] - for k in table_bbox.keys(): - for coord in table_bbox[k]: - x_coord.append(coord[0]) - y_coord.append(coord[1]) - max_x, max_y = max(x_coord), max(y_coord) - plt.plot(x_coord, y_coord, 'ro') - plt.axis([0, max_x + 100, max_y + 100, 0]) - plt.imshow(img) - plt.show() - except AttributeError: - raise ValueError("This option can only be used with Lattice.") - elif self.debug == 'line': - try: - for v_s, h_s in self.debug_segments: - for v in v_s: - plt.plot([v[0], v[2]], [v[1], v[3]]) - for h in h_s: - plt.plot([h[0], h[2]], [h[1], h[3]]) - plt.show() - except AttributeError: - raise ValueError("This option can only be used with Lattice.") - elif self.debug == 'table': - try: - for tables in self.debug_tables: - for table in tables: - for r in range(len(table.rows)): - for c in range(len(table.cols)): - if table.cells[r][c].left: - plt.plot([table.cells[r][c].lb[0], - table.cells[r][c].lt[0]], - [table.cells[r][c].lb[1], - table.cells[r][c].lt[1]]) - if table.cells[r][c].right: - plt.plot([table.cells[r][c].rb[0], - table.cells[r][c].rt[0]], - [table.cells[r][c].rb[1], - table.cells[r][c].rt[1]]) - if table.cells[r][c].top: - plt.plot([table.cells[r][c].lt[0], - table.cells[r][c].rt[0]], - [table.cells[r][c].lt[1], - table.cells[r][c].rt[1]]) - if table.cells[r][c].bottom: - plt.plot([table.cells[r][c].lb[0], - table.cells[r][c].rb[0]], - [table.cells[r][c].lb[1], - table.cells[r][c].rb[1]]) - plt.show() - except AttributeError: - raise ValueError("This option can only be used with Lattice.") - else: - raise UserWarning("This method can only be called after" - " debug has been specified.") \ No newline at end of file