diff --git a/camelot/core.py b/camelot/core.py index fe52411..7170221 100644 --- a/camelot/core.py +++ b/camelot/core.py @@ -10,6 +10,11 @@ from operator import itemgetter import numpy as np import pandas as pd +from .utils import ( + compute_accuracy, + compute_whitespace, +) + # minimum number of vertical textline intersections for a textedge # to be considered valid @@ -479,6 +484,9 @@ class Table(object): self.whitespace = 0 self.order = None self.page = None + self.flavor = None # Flavor of the parser that generated the table + self.pdf_size = None # Dimensions of the original PDF page + self.debug_info = None # Field holding debug data def __repr__(self): return "<{} shape={}>".format(self.__class__.__name__, self.shape) @@ -513,6 +521,17 @@ class Table(object): } return report + def fill_data(self, parser): + self.flavor = parser.id + self.debug_info = parser.debug_info + data = self.data + self.df = pd.DataFrame(data) + self.shape = self.df.shape + + self.whitespace = compute_whitespace(data) + + self.pdf_size = (parser.pdf_width, parser.pdf_height) + def set_all_edges(self): """Sets all table edges to True. """ @@ -747,6 +766,7 @@ class Table(object): "encoding": "utf-8", } kw.update(kwargs) + # pylint: disable=abstract-class-instantiated writer = pd.ExcelWriter(path) self.df.to_excel(writer, **kw) writer.save() @@ -874,6 +894,7 @@ class TableList(object): self._compress_dir(**kwargs) elif f == "excel": filepath = os.path.join(dirname, basename) + # pylint: disable=abstract-class-instantiated writer = pd.ExcelWriter(filepath) for table in self._tables: sheet_name = "page-{}-table-{}".format(table.page, table.order) diff --git a/camelot/handlers.py b/camelot/handlers.py index 3a6d663..9b805fa 100644 --- a/camelot/handlers.py +++ b/camelot/handlers.py @@ -101,26 +101,32 @@ class PDFHandler(object): temp : str Tmp directory. + Returns + ------- + fpath : str + The path of the single page PDF created. 
+ """ + fpath = os.path.join(temp, "page-{0}.pdf".format(page)) with open(filepath, "rb") as fileobj: infile = PdfFileReader(fileobj, strict=False) if infile.isEncrypted: infile.decrypt(self.password) - fpath = os.path.join(temp, "page-{0}.pdf".format(page)) froot, fext = os.path.splitext(fpath) p = infile.getPage(page - 1) outfile = PdfFileWriter() outfile.addPage(p) with open(fpath, "wb") as f: outfile.write(f) - layout, dim = get_page_layout(fpath) + layout, __ = get_page_layout(fpath) # fix rotated PDF chars = get_text_objects(layout, ltype="char") horizontal_text = get_text_objects(layout, ltype="horizontal_text") vertical_text = get_text_objects(layout, ltype="vertical_text") rotation = get_rotation(chars, horizontal_text, vertical_text) if rotation != "": - fpath_new = "".join([froot.replace("page", "p"), "_rotated", fext]) + fpath_new = "".join( + [froot.replace("page", "p"), "_rotated", fext]) os.rename(fpath, fpath_new) infile = PdfFileReader(open(fpath_new, "rb"), strict=False) if infile.isEncrypted: @@ -134,9 +140,11 @@ class PDFHandler(object): outfile.addPage(p) with open(fpath, "wb") as f: outfile.write(f) + return fpath def parse( - self, flavor="lattice", suppress_stdout=False, layout_kwargs={}, **kwargs + self, flavor="lattice", suppress_stdout=False, + layout_kwargs={}, **kwargs ): """Extracts tables by calling parser.get_tables on all single page PDFs. @@ -149,7 +157,7 @@ class PDFHandler(object): suppress_stdout : str (default: False) Suppress logs and warnings. layout_kwargs : dict, optional (default: {}) - A dict of `pdfminer.layout.LAParams `_ kwargs. + A dict of `pdfminer.layout.LAParams `_ kwargs. # noqa kwargs : dict See camelot.read_pdf kwargs. 
@@ -161,15 +169,22 @@ class PDFHandler(object): """ tables = [] with TemporaryDirectory() as tempdir: - for p in self.pages: - self._save_page(self.filepath, p, tempdir) - pages = [ - os.path.join(tempdir, "page-{0}.pdf".format(p)) for p in self.pages - ] - parser = Lattice(**kwargs) if flavor == "lattice" else Stream(**kwargs) - for p in pages: + parser = \ + Lattice(**kwargs) if flavor == "lattice" else Stream(**kwargs) + + # For each of the pages we need to parse, generate a single page + # .pdf in a temporary folder. + for page_idx in self.pages: + single_page_pdf_file = self._save_page( + self.filepath, + page_idx, + tempdir + ) t = parser.extract_tables( - p, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs + single_page_pdf_file, + page_idx, + suppress_stdout=suppress_stdout, + layout_kwargs=layout_kwargs ) tables.extend(t) return TableList(sorted(tables)) diff --git a/camelot/image_processing.py b/camelot/image_processing.py index 7b87101..f5ceedb 100644 --- a/camelot/image_processing.py +++ b/camelot/image_processing.py @@ -2,11 +2,13 @@ from __future__ import division -import cv2 +from cv2 import cv2 import numpy as np -def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2): +def adaptive_threshold( + imagename, process_background=False, blocksize=15, c=-2 +): """Thresholds an image using OpenCV's adaptiveThreshold. Parameters @@ -19,12 +21,12 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2): Size of a pixel neighborhood that is used to calculate a threshold value for the pixel: 3, 5, 7, and so on. - For more information, refer `OpenCV's adaptiveThreshold `_. + For more information, refer `OpenCV's adaptiveThreshold `_. # noqa c : int, optional (default: -2) Constant subtracted from the mean or weighted mean. Normally, it is positive but may be zero or negative as well. - For more information, refer `OpenCV's adaptiveThreshold `_. 
+ For more information, refer `OpenCV's adaptiveThreshold `_. # noqa Returns ------- @@ -39,7 +41,9 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2): if process_background: threshold = cv2.adaptiveThreshold( - gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, blocksize, c + gray, 255, + cv2.ADAPTIVE_THRESH_GAUSSIAN_C, + cv2.THRESH_BINARY, blocksize, c ) else: threshold = cv2.adaptiveThreshold( @@ -54,7 +58,8 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2): def find_lines( - threshold, regions=None, direction="horizontal", line_scale=15, iterations=0 + threshold, regions=None, direction="horizontal", + line_scale=15, iterations=0 ): """Finds horizontal and vertical lines by applying morphological transformations on an image. @@ -78,7 +83,7 @@ def find_lines( iterations : int, optional (default: 0) Number of times for erosion/dilation is applied. - For more information, refer `OpenCV's dilate `_. + For more information, refer `OpenCV's dilate `_. 
# noqa Returns ------- @@ -100,13 +105,14 @@ def find_lines( size = threshold.shape[1] // line_scale el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1)) elif direction is None: - raise ValueError("Specify direction as either 'vertical' or 'horizontal'") + raise ValueError("Specify direction as either 'vertical' " + "or 'horizontal'") if regions is not None: region_mask = np.zeros(threshold.shape) for region in regions: x, y, w, h = region - region_mask[y : y + h, x : x + w] = 1 + region_mask[y: y + h, x: x + w] = 1 threshold = np.multiply(threshold, region_mask) threshold = cv2.erode(threshold, el) @@ -115,12 +121,16 @@ def find_lines( try: _, contours, _ = cv2.findContours( - threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE + threshold.astype(np.uint8), + cv2.RETR_EXTERNAL, + cv2.CHAIN_APPROX_SIMPLE ) except ValueError: # for opencv backward compatibility contours, _ = cv2.findContours( - threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE + threshold.astype(np.uint8), + cv2.RETR_EXTERNAL, + cv2.CHAIN_APPROX_SIMPLE ) for c in contours: @@ -202,7 +212,7 @@ def find_joints(contours, vertical, horizontal): tables = {} for c in contours: x, y, w, h = c - roi = joints[y : y + h, x : x + w] + roi = joints[y: y + h, x: x + w] try: __, jc, __ = cv2.findContours( roi.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE diff --git a/camelot/parsers/base.py b/camelot/parsers/base.py index cb1bc21..14446b1 100644 --- a/camelot/parsers/base.py +++ b/camelot/parsers/base.py @@ -2,19 +2,94 @@ import os -from ..utils import get_page_layout, get_text_objects +from ..utils import ( + get_page_layout, + get_text_objects +) +from ..core import Table + +from ..image_processing import ( + adaptive_threshold, + find_lines, + find_contours, + find_joints +) + +# Pylint can't detect contents of cv2 +from cv2 import imread # pylint: disable=no-name-in-module class BaseParser(object): """Defines a base parser. 
""" + def __init__(self, parser_id): + self.imagename = None + self.pdf_image = None + self.id = parser_id - def _generate_layout(self, filename, layout_kwargs): + # For plotting details of parsing algorithms + self.debug_info = {} + + def _generate_layout(self, filename, page_idx, layout_kwargs): self.filename = filename self.layout_kwargs = layout_kwargs - self.layout, self.dimensions = get_page_layout(filename, **layout_kwargs) + self.layout, self.dimensions = get_page_layout( + filename, + **layout_kwargs + ) self.images = get_text_objects(self.layout, ltype="image") - self.horizontal_text = get_text_objects(self.layout, ltype="horizontal_text") - self.vertical_text = get_text_objects(self.layout, ltype="vertical_text") + self.horizontal_text = get_text_objects( + self.layout, + ltype="horizontal_text" + ) + self.vertical_text = get_text_objects( + self.layout, + ltype="vertical_text" + ) self.pdf_width, self.pdf_height = self.dimensions self.rootname, __ = os.path.splitext(self.filename) + + self.page = page_idx + + def generate_image(self): + if self.pdf_image is None: + self._generate_image_file() + self.pdf_image = imread(self.imagename) + + def _generate_image_file(self): + if self.imagename: + return + from ..ext.ghostscript import Ghostscript + + self.imagename = "".join([self.rootname, ".png"]) + gs_call = "-q -sDEVICE=png16m -o {} -r300 {}".format( + self.imagename, self.filename + ) + gs_call = gs_call.encode().split() + null = open(os.devnull, "wb") + Ghostscript(*gs_call, stdout=null) + # with Ghostscript(*gs_call, stdout=null) as gs: + # pass + null.close() + + """Initialize new table object, ready to be populated + + Parameters + ---------- + table_idx : int + Index of this table within the pdf page analyzed + cols : list + list of coordinate boundaries tuples (left, right) + rows : list + list of coordinate boundaries tuples (bottom, top) + + Returns + ------- + t : camelot.core.Table + + """ + def _initialize_new_table(self, table_idx, cols, 
rows): + table = Table(cols, rows) + table.page = self.page + table.order = table_idx + 1 + return table diff --git a/camelot/parsers/lattice.py b/camelot/parsers/lattice.py index 197ff9f..a96f8df 100644 --- a/camelot/parsers/lattice.py +++ b/camelot/parsers/lattice.py @@ -13,7 +13,6 @@ import numpy as np import pandas as pd from .base import BaseParser -from ..core import Table from ..utils import ( scale_image, scale_pdf, @@ -22,7 +21,6 @@ from ..utils import ( merge_close_lines, get_table_index, compute_accuracy, - compute_whitespace, ) from ..image_processing import ( adaptive_threshold, @@ -80,7 +78,7 @@ class Lattice(BaseParser): Size of a pixel neighborhood that is used to calculate a threshold value for the pixel: 3, 5, 7, and so on. - For more information, refer `OpenCV's adaptiveThreshold `_. + For more information, refer `OpenCV's adaptiveThreshold `_. # noqa threshold_constant : int, optional (default: -2) Constant subtracted from the mean or weighted mean. Normally, it is positive but may be zero or negative as well. 
@@ -114,6 +112,7 @@ class Lattice(BaseParser): resolution=300, **kwargs ): + super().__init__("lattice") self.table_regions = table_regions self.table_areas = table_areas self.process_background = process_background @@ -208,19 +207,6 @@ class Lattice(BaseParser): t.cells[i][j].text = t.cells[i - 1][j].text return t - def _generate_image(self): - from ..ext.ghostscript import Ghostscript - - self.imagename = "".join([self.rootname, ".png"]) - gs_call = "-q -sDEVICE=png16m -o {} -r300 {}".format( - self.imagename, self.filename - ) - gs_call = gs_call.encode().split() - null = open(os.devnull, "wb") - with Ghostscript(*gs_call, stdout=null) as gs: - pass - null.close() - def _generate_table_bbox(self): def scale_areas(areas): scaled_areas = [] @@ -234,20 +220,21 @@ class Lattice(BaseParser): scaled_areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1))) return scaled_areas - self.image, self.threshold = adaptive_threshold( + self.pdf_image, self.threshold = adaptive_threshold( self.imagename, process_background=self.process_background, blocksize=self.threshold_blocksize, c=self.threshold_constant, ) - image_width = self.image.shape[1] - image_height = self.image.shape[0] + image_width = self.pdf_image.shape[1] + image_height = self.pdf_image.shape[0] image_width_scaler = image_width / float(self.pdf_width) image_height_scaler = image_height / float(self.pdf_height) pdf_width_scaler = self.pdf_width / float(image_width) pdf_height_scaler = self.pdf_height / float(image_height) - image_scalers = (image_width_scaler, image_height_scaler, self.pdf_height) + image_scalers = (image_width_scaler, + image_height_scaler, self.pdf_height) pdf_scalers = (pdf_width_scaler, pdf_height_scaler, image_height) if self.table_areas is None: @@ -291,7 +278,11 @@ class Lattice(BaseParser): self.table_bbox_unscaled = copy.deepcopy(table_bbox) - self.table_bbox, self.vertical_segments, self.horizontal_segments = scale_image( + [ + self.table_bbox, + self.vertical_segments, + 
self.horizontal_segments + ] = scale_image( table_bbox, vertical_segments, horizontal_segments, pdf_scalers ) @@ -315,7 +306,10 @@ class Lattice(BaseParser): rows.extend([tk[1], tk[3]]) # sort horizontal and vertical segments cols = merge_close_lines(sorted(cols), line_tol=self.line_tol) - rows = merge_close_lines(sorted(rows, reverse=True), line_tol=self.line_tol) + rows = merge_close_lines( + sorted(rows, reverse=True), + line_tol=self.line_tol + ) # make grid using x and y coord of shortlisted rows and cols cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)] rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)] @@ -328,7 +322,7 @@ class Lattice(BaseParser): if v_s is None or h_s is None: raise ValueError("No segments found on {}".format(self.rootname)) - table = Table(cols, rows) + table = self._initialize_new_table(table_idx, cols, rows) # set table edges to True using ver+hor lines table = table.set_edges(v_s, h_s, joint_tol=self.joint_tol) # set table border edges to True @@ -359,48 +353,44 @@ class Lattice(BaseParser): accuracy = compute_accuracy([[100, pos_errors]]) if self.copy_text is not None: - table = Lattice._copy_spanning_text(table, copy_text=self.copy_text) + table = Lattice._copy_spanning_text( + table, + copy_text=self.copy_text + ) - data = table.data - table.df = pd.DataFrame(data) - table.shape = table.df.shape - - whitespace = compute_whitespace(data) - table.flavor = "lattice" + table.fill_data(self) table.accuracy = accuracy - table.whitespace = whitespace - table.order = table_idx + 1 - table.page = int(os.path.basename(self.rootname).replace("page-", "")) # for plotting _text = [] _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text]) _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text]) table._text = _text - table._image = (self.image, self.table_bbox_unscaled) + table._image = (self.pdf_image, self.table_bbox_unscaled) table._segments = (self.vertical_segments, 
self.horizontal_segments) table._textedges = None return table - def extract_tables(self, filename, suppress_stdout=False, layout_kwargs={}): - self._generate_layout(filename, layout_kwargs) + def extract_tables(self, filename, page_idx=1, suppress_stdout=False, + layout_kwargs={}): + self._generate_layout(filename, page_idx, layout_kwargs) if not suppress_stdout: - logger.info("Processing {}".format(os.path.basename(self.rootname))) + logger.info(f"Processing {os.path.basename(self.rootname)}") if not self.horizontal_text: if self.images: warnings.warn( - "{} is image-based, camelot only works on" - " text-based pages.".format(os.path.basename(self.rootname)) + f"{os.path.basename(self.rootname)} is image-based, " + "camelot only works on text-based pages." ) else: warnings.warn( - "No tables found on {}".format(os.path.basename(self.rootname)) + f"No tables found on {os.path.basename(self.rootname)}" ) return [] - self._generate_image() + self._generate_image_file() self._generate_table_bbox() _tables = [] @@ -408,8 +398,10 @@ class Lattice(BaseParser): for table_idx, tk in enumerate( sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True) ): - cols, rows, v_s, h_s = self._generate_columns_and_rows(table_idx, tk) - table = self._generate_table(table_idx, cols, rows, v_s=v_s, h_s=h_s) + cols, rows, v_s, h_s = self._generate_columns_and_rows( + table_idx, tk) + table = self._generate_table( + table_idx, cols, rows, v_s=v_s, h_s=h_s) table._bbox = tk _tables.append(table) diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py index c939c8f..0d393f3 100644 --- a/camelot/parsers/stream.py +++ b/camelot/parsers/stream.py @@ -9,7 +9,7 @@ import numpy as np import pandas as pd from .base import BaseParser -from ..core import TextEdges, Table +from ..core import TextEdges from ..utils import (text_in_bbox, get_table_index, compute_accuracy, compute_whitespace) @@ -69,6 +69,7 @@ class Stream(BaseParser): column_tol=0, **kwargs ): + 
super().__init__("stream") self.table_regions = table_regions self.table_areas = table_areas self.columns = columns @@ -120,21 +121,26 @@ class Stream(BaseParser): Two-dimensional list of text objects grouped into rows. """ - row_y = 0 + row_y = None rows = [] temp = [] - for t in text: + non_empty_text = [t for t in text if t.get_text().strip()] + for t in non_empty_text: # is checking for upright necessary? - # if t.get_text().strip() and all([obj.upright for obj in t._objs + # if t.get_text().strip() and all([obj.upright \ + # for obj in t._objs # if type(obj) is LTChar]): - if t.get_text().strip(): - if not np.isclose(row_y, t.y0, atol=row_tol): - rows.append(sorted(temp, key=lambda t: t.x0)) - temp = [] - row_y = t.y0 - temp.append(t) + if row_y is not None and \ + not np.isclose(row_y, t.y0, atol=row_tol) and \ + 0.5 * (t.y1 + t.y0) < row_y: + rows.append(sorted(temp, key=lambda t: t.x0)) + temp = [] + # We update the row's bottom as we go, to be forgiving if there + # is a gradual change across multiple columns. + row_y = t.y0 + + temp.append(t) rows.append(sorted(temp, key=lambda t: t.x0)) - __ = rows.pop(0) # TODO: hacky return rows @staticmethod @@ -278,7 +284,7 @@ class Stream(BaseParser): def _nurminen_table_detection(self, textlines): """A general implementation of the table detection algorithm described by Anssi Nurminen's master's thesis. - Link: https://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3 + Link: https://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3 # noqa Assumes that tables are situated relatively far apart vertically. 
@@ -378,12 +384,29 @@ class Stream(BaseParser): "No tables found in table area {}" .format(table_idx + 1) ) - cols = [ - (t.x0, t.x1) for r in rows_grouped if len(r) == ncols - for t in r + + # Identify rows which contain the mode of the number of columns + full_rows = list(filter( + lambda row: len(row) == ncols, + rows_grouped)) + cells_on_full_rows_xrange = [ + (t.x0, t.x1) for r in full_rows for t in r ] - cols = self._merge_columns(sorted(cols), + # TODO: fixme / make a decision on this + # plausible_rows = list(filter( + # lambda row: len(row) <= ncols*1.2 and len(row) >= ncols*.8, + # rows_grouped)) + # plausible_cells_xrange = [ + # (t.x0, t.x1) for r in plausible_rows for t in r + # ] + # self.debug_info['plausible_rows'] = plausible_rows + + # Identify column boundaries based on the contents of these rows + cols = self._merge_columns(sorted(cells_on_full_rows_xrange), column_tol=self.column_tol) + # cols = self._merge_columns(sorted(plausible_cells_xrange), + # column_tol=self.column_tol) + inner_text = [] for i in range(1, len(cols)): left = cols[i - 1][1] @@ -409,7 +432,7 @@ class Stream(BaseParser): return cols, rows def _generate_table(self, table_idx, cols, rows, **kwargs): - table = Table(cols, rows) + table = self._initialize_new_table(table_idx, cols, rows) table = table.set_all_edges() pos_errors = [] @@ -431,31 +454,25 @@ class Stream(BaseParser): table.cells[r_idx][c_idx].text = text accuracy = compute_accuracy([[100, pos_errors]]) - data = table.data - table.df = pd.DataFrame(data) - table.shape = table.df.shape + table.fill_data(self) - whitespace = compute_whitespace(data) - table.flavor = "stream" table.accuracy = accuracy - table.whitespace = whitespace - table.order = table_idx + 1 - table.page = int(os.path.basename(self.rootname).replace("page-", "")) # for plotting _text = [] _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text]) _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text]) table._text = _text - 
table._image = None + self.generate_image() + table._image = (self.pdf_image, self.table_bbox) table._segments = None table._textedges = self.textedges return table - def extract_tables(self, filename, suppress_stdout=False, + def extract_tables(self, filename, page_idx=1, suppress_stdout=False, layout_kwargs={}): - self._generate_layout(filename, layout_kwargs) + self._generate_layout(filename, page_idx, layout_kwargs) if not suppress_stdout: logger.info("Processing {}".format( os.path.basename(self.rootname))) @@ -474,6 +491,8 @@ class Stream(BaseParser): ) return [] + # Identify plausible areas within the doc where tables lie, + # populate table_bbox keys with these areas. self._generate_table_bbox() _tables = [] diff --git a/camelot/plotting.py b/camelot/plotting.py index 5e0dc0c..75d4449 100644 --- a/camelot/plotting.py +++ b/camelot/plotting.py @@ -37,7 +37,7 @@ class PlotMethods(object): raise NotImplementedError( "Lattice flavor does not support kind='{}'".format(kind) ) - elif table.flavor == "stream" and kind in ["joint", "line"]: + elif table.flavor == "stream" and kind in ["line"]: raise NotImplementedError( "Stream flavor does not support kind='{}'".format(kind) ) @@ -64,9 +64,18 @@ class PlotMethods(object): for t in table._text: xs.extend([t[0], t[2]]) ys.extend([t[1], t[3]]) - ax.add_patch(patches.Rectangle((t[0], t[1]), t[2] - t[0], t[3] - t[1])) + ax.add_patch( + patches.Rectangle( + (t[0], t[1]), + t[2] - t[0], + t[3] - t[1], + alpha=0.5 + ) + ) ax.set_xlim(min(xs) - 10, max(xs) + 10) ax.set_ylim(min(ys) - 10, max(ys) + 10) + img, __ = table._image + ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1])) return fig def grid(self, table): @@ -94,6 +103,9 @@ class PlotMethods(object): ax.plot([cell.lt[0], cell.rt[0]], [cell.lt[1], cell.rt[1]]) if cell.bottom: ax.plot([cell.lb[0], cell.rb[0]], [cell.lb[1], cell.rb[1]]) + + img, __ = table._image + ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1])) return fig def 
contour(self, table): @@ -109,12 +121,8 @@ class PlotMethods(object): fig : matplotlib.fig.Figure """ - try: - img, table_bbox = table._image - _FOR_LATTICE = True - except TypeError: - img, table_bbox = (None, {table._bbox: None}) - _FOR_LATTICE = False + img, table_bbox = table._image + _FOR_LATTICE = table.flavor == "lattice" fig = plt.figure() ax = fig.add_subplot(111, aspect="equal") @@ -132,7 +140,8 @@ class PlotMethods(object): for t in table_bbox.keys(): ax.add_patch( patches.Rectangle( - (t[0], t[1]), t[2] - t[0], t[3] - t[1], fill=False, color="red" + (t[0], t[1]), t[2] - t[0], t[3] - t[1], + fill=False, color="red" ) ) if not _FOR_LATTICE: @@ -143,6 +152,8 @@ class PlotMethods(object): if _FOR_LATTICE: ax.imshow(img) + else: + ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1])) return fig def textedge(self, table): @@ -164,7 +175,11 @@ class PlotMethods(object): xs.extend([t[0], t[2]]) ys.extend([t[1], t[3]]) ax.add_patch( - patches.Rectangle((t[0], t[1]), t[2] - t[0], t[3] - t[1], color="blue") + patches.Rectangle( + (t[0], t[1]), t[2] - t[0], t[3] - t[1], + color="blue", + alpha=0.5 + ) ) ax.set_xlim(min(xs) - 10, max(xs) + 10) ax.set_ylim(min(ys) - 10, max(ys) + 10) @@ -172,6 +187,8 @@ class PlotMethods(object): for te in table._textedges: ax.plot([te.x, te.x], [te.y0, te.y1]) + img, __ = table._image + ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1])) return fig def joint(self, table): @@ -220,4 +237,8 @@ class PlotMethods(object): ax.plot([v[0], v[2]], [v[1], v[3]]) for h in horizontal: ax.plot([h[0], h[2]], [h[1], h[3]]) + + img, __ = table._image + ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1])) + return fig diff --git a/camelot/utils.py b/camelot/utils.py index e7ad848..bfc227f 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -13,6 +13,7 @@ from itertools import groupby from operator import itemgetter import numpy as np +import pandas as pd from pdfminer.pdfparser import PDFParser from 
pdfminer.pdfdocument import PDFDocument from pdfminer.pdfpage import PDFPage @@ -30,6 +31,9 @@ from pdfminer.layout import ( ) +# pylint: disable=import-error +# PyLint will evaluate both branches, and will necessarily complain about one +# of them. PY3 = sys.version_info[0] >= 3 if PY3: from urllib.request import urlopen @@ -310,7 +314,8 @@ def get_rotation(chars, horizontal_text, vertical_text): if hlen < vlen: clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in chars) anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in chars) - rotation = "anticlockwise" if clockwise < anticlockwise else "clockwise" + rotation = "anticlockwise" if clockwise < anticlockwise \ + else "clockwise" return rotation @@ -341,12 +346,16 @@ def segments_in_bbox(bbox, v_segments, h_segments): v_s = [ v for v in v_segments - if v[1] > lb[1] - 2 and v[3] < rt[1] + 2 and lb[0] - 2 <= v[0] <= rt[0] + 2 + if v[1] > lb[1] - 2 and + v[3] < rt[1] + 2 and + lb[0] - 2 <= v[0] <= rt[0] + 2 ] h_s = [ h for h in h_segments - if h[0] > lb[0] - 2 and h[2] < rt[0] + 2 and lb[1] - 2 <= h[1] <= rt[1] + 2 + if h[0] > lb[0] - 2 and + h[2] < rt[0] + 2 and + lb[1] - 2 <= h[1] <= rt[1] + 2 ] return v_s, h_s @@ -464,10 +473,10 @@ def flag_font_size(textline, direction, strip_text=""): for t in textline if not isinstance(t, LTAnno) ] - l = [np.round(size, decimals=6) for text, size in d] - if len(set(l)) > 1: + text_sizes = [np.round(size, decimals=6) for text, size in d] + if len(set(text_sizes)) > 1: flist = [] - min_size = min(l) + min_size = min(text_sizes) for key, chars in groupby(d, itemgetter(1)): if key == min_size: fchars = [t[0] for t in chars] @@ -511,7 +520,6 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""): of row/column and text is the an lttextline substring. 
""" - idx = 0 cut_text = [] bbox = textline.bbox try: @@ -528,7 +536,9 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""): ] r = r_idx[0] x_cuts = [ - (c, table.cells[r][c].x2) for c in x_overlap if table.cells[r][c].right + (c, table.cells[r][c].x2) + for c in x_overlap + if table.cells[r][c].right ] if not x_cuts: x_cuts = [(x_overlap[0], table.cells[r][-1].x2)] @@ -561,7 +571,9 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""): ] c = c_idx[0] y_cuts = [ - (r, table.cells[r][c].y1) for r in y_overlap if table.cells[r][c].bottom + (r, table.cells[r][c].y1) + for r in y_overlap + if table.cells[r][c].bottom ] if not y_cuts: y_cuts = [(y_overlap[0], table.cells[-1][c].y1)] @@ -644,9 +656,8 @@ def get_table_index( """ r_idx, c_idx = [-1] * 2 for r in range(len(table.rows)): - if (t.y0 + t.y1) / 2.0 < table.rows[r][0] and (t.y0 + t.y1) / 2.0 > table.rows[ - r - ][1]: + if (t.y0 + t.y1) / 2.0 < table.rows[r][0] and \ + (t.y0 + t.y1) / 2.0 > table.rows[r][1]: lt_col_overlap = [] for c in table.cols: if c[0] <= t.x1 and c[1] >= t.x0: @@ -681,7 +692,9 @@ def get_table_index( X = 1.0 if abs(t.x0 - t.x1) == 0.0 else abs(t.x0 - t.x1) Y = 1.0 if abs(t.y0 - t.y1) == 0.0 else abs(t.y0 - t.y1) charea = X * Y - error = ((X * (y0_offset + y1_offset)) + (Y * (x0_offset + x1_offset))) / charea + error = ( + (X * (y0_offset + y1_offset)) + (Y * (x0_offset + x1_offset)) + ) / charea if split_text: return ( @@ -697,13 +710,16 @@ def get_table_index( ( r_idx, c_idx, - flag_font_size(t._objs, direction, strip_text=strip_text), + flag_font_size(t._objs, + direction, + strip_text=strip_text), ) ], error, ) else: - return [(r_idx, c_idx, text_strip(t.get_text(), strip_text))], error + return [(r_idx, c_idx, text_strip(t.get_text(), strip_text))], \ + error def compute_accuracy(error_weights): @@ -751,7 +767,6 @@ def compute_whitespace(d): """ whitespace = 0 - r_nempty_cells, c_nempty_cells = [], [] for i in d: for j in i: if 
j.strip() == "": @@ -852,3 +867,78 @@ def get_text_objects(layout, ltype="char", t=None): except AttributeError: pass return t + + +def compare_tables(left, right): + """Compare two tables and displays differences in a human readable form. + + Parameters + ---------- + left : data frame + right : data frame + """ + diff_cols = right.shape[1]-left.shape[1] + diff_rows = right.shape[0]-left.shape[0] + differences = [] + if (diff_rows): + differences.append( + f"{abs(diff_rows)} " + f"{'more' if diff_rows>0 else 'fewer'} rows" + ) + if (diff_cols): + differences.append( + f"{abs(diff_cols)} " + f"{'more' if diff_cols>0 else 'fewer'} columns" + ) + if differences: + differences_str = " and ".join(differences) + print(f"Right has {differences_str} than left " + f"[{right.shape[0]},{right.shape[1]}] vs " + f"[{left.shape[0]},{left.shape[1]}]") + + table1, table2 = [left, right] + name_table1, name_table2 = ["left", "right"] + if not diff_rows: + # Same number of rows: compare columns since they're of the same length + if diff_cols > 0: + # Use the longest table as a reference + table1, table2 = table2, table1 + name_table1, name_table2 = name_table2, name_table1 + for i, col in enumerate(table1.columns): + lcol = table1.iloc[:, i] + if col in table2: + scol = table2.iloc[:, i] + if not lcol.equals(scol): + diff_df = pd.DataFrame() + diff_df[name_table1] = scol + diff_df[name_table2] = lcol + diff_df["Match"] = lcol == scol + print( + f"Column {i} different:\n" + f"{diff_df}" + ) + break + else: + print("Column {i} unique to {name_table1}: {lcol}") + break + elif not diff_cols: + # Same number of cols: compare rows since they're of the same length + if diff_rows > 0: + # Use the longest table as a reference + table1, table2 = table2, table1 + name_table1, name_table2 = name_table2, name_table1 + for i in table1.iterrows(): + lrow = table1.loc[i, :] + if i < table2.shape[1]: + srow = table2.loc[i, :] + if not lrow.equals(srow): + diff_df = pd.DataFrame() + diff_df = 
diff_df.append(lrow, ignore_index=True) + diff_df = diff_df.append(srow, ignore_index=True) + diff_df.insert(0, 'Table', [name_table1, name_table2]) + print(f"Column {i} differs:") + print(diff_df.values) + break + else: + print(f"Row {i} unique to {name_table1}: {lrow}") + break diff --git a/tests/data.py b/tests/data.py index 679c35c..dfa69f1 100755 --- a/tests/data.py +++ b/tests/data.py @@ -838,7 +838,7 @@ data_stream_two_tables_1 = [ "2,330 .9", ], [ - "Violent crime . . . . . . . .\n . .\n . .\n . .\n" \ + "Violent crime . . . . . . . .\n . .\n . .\n . .\n" " . .\n . .", "467 .9", "69 .1", @@ -1503,15 +1503,8 @@ data_stream_table_areas = [ ] data_stream_columns = [ - [ - "Clave", - "Nombre Entidad", - "Clave", - "Nombre Municipio", - "Clave", - "Nombre Localidad", - ], - ["Entidad", "", "Municipio", "", "Localidad", ""], + ["Clave \nEntidad", "Nombre Entidad", "Clave \nMunicipio", + "Nombre Municipio", "Clave \nLocalidad", "Nombre Localidad"], ["01", "Aguascalientes", "001", "Aguascalientes", "0094", "Granja Adelita"], ["01", "Aguascalientes", "001", "Aguascalientes", "0096", "Agua Azul"], ["01", "Aguascalientes", "001", "Aguascalientes", "0100", "Rancho Alegre"], @@ -2732,11 +2725,9 @@ data_stream_vertical_headers = [ ['', '', '', '', '', '', '', '', '', '', '', 'Congress-', 'Senator 36th', 'Rep106th', '', 'Reg. of', '', 'Road', '', '', 'Distri', 'Dist', '', '', 'Dist'], - ['', '', '', '', '', '', '', '', '', '', '1st Dist', '', 'Dist.', - 'Dist.', '', 'Deeds', '', 'Commission', '', 'District #1', - 'ct #2', '#3', 'Dist #4', '', '#5'], - ['', '', '', '', '', 'Governor', '', '', 'U.S. Senator', '', '', - '', '', '', '', '', '', '', '', '', '', '', '', '', ''], + ['', '', '', '', '', 'Governor', '', '', 'U.S. 
Senator', '', + '1st Dist', '', 'Dist.', 'Dist.', '', 'Deeds', '', 'Commission', + '', 'District #1', 'ct #2', '#3', 'Dist #4', '', '#5'], ['', 'Number of Registered voters', 'Poll Book Totals', 'Brian Calley', 'Patrick Colbeck', 'Jim Hines', 'Bill Schuette', 'John James', 'Sandy Pensler', '', 'Jack Bergman', '', diff --git a/tests/files/baseline_plots/test_grid_plot.png b/tests/files/baseline_plots/test_grid_plot.png index 0607d15..87fe2aa 100644 Binary files a/tests/files/baseline_plots/test_grid_plot.png and b/tests/files/baseline_plots/test_grid_plot.png differ diff --git a/tests/files/baseline_plots/test_line_plot.png b/tests/files/baseline_plots/test_line_plot.png index 12c44c0..6ddeace 100644 Binary files a/tests/files/baseline_plots/test_line_plot.png and b/tests/files/baseline_plots/test_line_plot.png differ diff --git a/tests/files/baseline_plots/test_stream_contour_plot.png b/tests/files/baseline_plots/test_stream_contour_plot.png index bfa6133..65cdc72 100644 Binary files a/tests/files/baseline_plots/test_stream_contour_plot.png and b/tests/files/baseline_plots/test_stream_contour_plot.png differ diff --git a/tests/files/baseline_plots/test_stream_grid_plot.png b/tests/files/baseline_plots/test_stream_grid_plot.png new file mode 100644 index 0000000..1afdf3f Binary files /dev/null and b/tests/files/baseline_plots/test_stream_grid_plot.png differ diff --git a/tests/files/baseline_plots/test_text_plot.png b/tests/files/baseline_plots/test_text_plot.png index 63b5520..497af37 100644 Binary files a/tests/files/baseline_plots/test_text_plot.png and b/tests/files/baseline_plots/test_text_plot.png differ diff --git a/tests/files/baseline_plots/test_textedge_plot.png b/tests/files/baseline_plots/test_textedge_plot.png index 6bb93e0..ed7a156 100644 Binary files a/tests/files/baseline_plots/test_textedge_plot.png and b/tests/files/baseline_plots/test_textedge_plot.png differ diff --git a/tests/test_common.py b/tests/test_common.py index 375e43a..20941e8 100644 
--- a/tests/test_common.py +++ b/tests/test_common.py @@ -9,10 +9,12 @@ from pandas.testing import assert_frame_equal import camelot from camelot.core import Table, TableList +from camelot.utils import compare_tables from camelot.__version__ import generate_version from .data import * + import pdfminer # The version of PDFMiner has an impact on some of the tests. Unfortunately, @@ -48,9 +50,11 @@ def test_password(): filename = os.path.join(testdir, "health_protected.pdf") tables = camelot.read_pdf(filename, password="ownerpass", flavor="stream") + assert len(tables) == 1 assert_frame_equal(df, tables[0].df) tables = camelot.read_pdf(filename, password="userpass", flavor="stream") + assert len(tables) == 1 assert_frame_equal(df, tables[0].df) @@ -59,6 +63,7 @@ def test_stream(): filename = os.path.join(testdir, "health.pdf") tables = camelot.read_pdf(filename, flavor="stream") + assert len(tables) == 1 assert_frame_equal(df, tables[0].df) @@ -79,6 +84,7 @@ def test_stream_table_rotated(): filename = os.path.join(testdir, "anticlockwise_table_2.pdf") tables = camelot.read_pdf(filename, flavor="stream") + assert len(tables) == 1 result_without_first_row = pd.DataFrame( tables[0].df.drop(tables[0].df.columns[0], axis=1).values) assert_frame_equal(df, result_without_first_row) @@ -275,9 +281,9 @@ def test_repr(): tables = camelot.read_pdf(filename) assert repr(tables) == "" assert repr(tables[0]) == "" - assert ( - repr(tables[0].cells[0][0]) == "" - ) + assert \ + repr(tables[0].cells[0][0]) == \ + "" def test_pages(): @@ -285,22 +291,23 @@ def test_pages(): tables = camelot.read_pdf(url) assert repr(tables) == "" assert repr(tables[0]) == "
" - assert ( - repr(tables[0].cells[0][0]) == "" - ) + assert \ + repr(tables[0].cells[0][0]) == \ + "" tables = camelot.read_pdf(url, pages="1-end") assert repr(tables) == "" assert repr(tables[0]) == "
" - assert ( - repr(tables[0].cells[0][0]) == "" - ) + assert \ + repr(tables[0].cells[0][0]) == \ + "" tables = camelot.read_pdf(url, pages="all") assert repr(tables) == "" assert repr(tables[0]) == "
" assert ( - repr(tables[0].cells[0][0]) == "" + repr(tables[0].cells[0][0]) == + "" ) @@ -310,7 +317,8 @@ def test_url(): assert repr(tables) == "" assert repr(tables[0]) == "
@pytest.mark.mpl_image_compare(
    baseline_dir="files/baseline_plots", remove_text=True)
def test_stream_grid_plot():
    """Return the grid plot of the first stream-flavor table of foo.pdf."""
    pdf_path = os.path.join(testdir, "foo.pdf")
    first_table = camelot.read_pdf(pdf_path, flavor="stream")[0]
    return camelot.plot(first_table, kind="grid")