From f54e1563e1e63cb64bc71c51e4f727bad0435a1b Mon Sep 17 00:00:00 2001 From: Francois Huet Date: Mon, 6 Apr 2020 12:47:23 -0700 Subject: [PATCH] Lint and address PDFMiner version impact on tests --- camelot/core.py | 126 +++++++++++++++++++++++--------------- camelot/parsers/stream.py | 51 +++++++++------ tests/data.py | 35 +++++++---- tests/test_common.py | 50 ++++++++++++--- tests/test_plotting.py | 21 +++++++ 5 files changed, 197 insertions(+), 86 deletions(-) diff --git a/camelot/core.py b/camelot/core.py index a0cc079..fe52411 100644 --- a/camelot/core.py +++ b/camelot/core.py @@ -17,6 +17,7 @@ TEXTEDGE_REQUIRED_ELEMENTS = 4 # maximum number of columns over which a header can spread MAX_COL_SPREAD_IN_HEADER = 3 + class TextEdge(object): """Defines a text edge coordinates relative to a left-bottom origin. (PDF coordinate space) @@ -64,7 +65,8 @@ class TextEdge(object): the is_valid attribute. """ if np.isclose(self.y0, y0, atol=edge_tol): - self.x = (self.intersections * self.x + x) / float(self.intersections + 1) + self.x = (self.intersections * self.x + x) / \ + float(self.intersections + 1) self.y0 = y0 self.intersections += 1 # a textedge is valid only if it extends uninterrupted @@ -140,26 +142,38 @@ class TextEdges(object): """ intersections_sum = { "left": sum( - te.intersections for te in self._textedges["left"] if te.is_valid + te.intersections for te in self._textedges["left"] + if te.is_valid ), "right": sum( - te.intersections for te in self._textedges["right"] if te.is_valid + te.intersections for te in self._textedges["right"] + if te.is_valid ), "middle": sum( - te.intersections for te in self._textedges["middle"] if te.is_valid + te.intersections for te in self._textedges["middle"] + if te.is_valid ), } # TODO: naive # get vertical textedges that intersect maximum number of # times with horizontal textlines - relevant_align = max(intersections_sum.items(), key=itemgetter(1))[0] - return list(filter(lambda te: te.is_valid, self._textedges[relevant_align])) + relevant_align = max( + intersections_sum.items(), + key=itemgetter(1) + )[0] + return list(filter( + lambda te: te.is_valid, + self._textedges[relevant_align]) + ) - def _expand_area_for_header(self, area, textlines, col_anchors, average_row_height): - """The core algorithm is based on fairly strict alignment of text. It works - ok for the table body, but might fail on tables' headers since they - tend to be in a different font, alignment (e.g. vertical), etc. + @staticmethod + def _expand_area_for_header(area, textlines, col_anchors, + average_row_height): + """The core algorithm is based on fairly strict alignment of text. + It works ok for the table body, but might fail on tables' headers + since they tend to be in a different font, alignment (e.g. vertical), + etc. The section below tries to identify whether what's above the bbox identified so far has the characteristics of a table header: Close to the top of the body, with cells that fit within the bounds @@ -174,10 +188,12 @@ class TextEdges(object): crossed by an element covering left to right. 
""" indexLeft = 0 - while indexLeft < len(col_anchors) and col_anchors[indexLeft] < left: + while indexLeft < len(col_anchors) \ + and col_anchors[indexLeft] < left: indexLeft += 1 indexRight = indexLeft - while indexRight < len(col_anchors) and col_anchors[indexRight] < right: + while indexRight < len(col_anchors) \ + and col_anchors[indexRight] < right: indexRight += 1 return indexRight - indexLeft @@ -193,14 +209,14 @@ class TextEdges(object): # higher than the table, directly within its bounds if te.y0 > top and te.x0 > left and te.x1 < right: all_above.append(te) - if closest_above == None or closest_above.y0 > te.y0: + if closest_above is None or closest_above.y0 > te.y0: closest_above = te if closest_above and \ closest_above.y0 < top + average_row_height: - # b/ We have a candidate cell that is within the correct vertical band, - # and directly above the table. Starting from this anchor, we list - # all the textlines within the same row. + # b/ We have a candidate cell that is within the correct + # vertical band, and directly above the table. Starting from + # this anchor, we list all the textlines within the same row. tls_in_new_row = [] top = closest_above.y1 pushed_up = True @@ -222,18 +238,20 @@ class TextEdges(object): top = te.y1 pushed_up = True - # Get the x-ranges for all the textlines, and merge the x-ranges that overlap + # Get the x-ranges for all the textlines, and merge the + # x-ranges that overlap zones = zones + \ list(map(lambda tl: [tl.x0, tl.x1], tls_in_new_row)) zones.sort(key=lambda z: z[0]) # Sort by left coordinate - # Starting from the right, if two zones overlap horizontally, merge them + # Starting from the right, if two zones overlap horizontally, + # merge them merged_something = True while merged_something: merged_something = False for i in range(len(zones) - 1, 0, -1): zone_right = zones[i] zone_left = zones[i-1] - if (zone_left[1] >= zone_right[0]): + if zone_left[1] >= zone_right[0]: zone_left[1] = max(zone_right[1], zone_left[1]) zones.pop(i) merged_something = True @@ -248,8 +266,8 @@ class TextEdges(object): ) ) if max_spread <= MAX_COL_SPREAD_IN_HEADER: - # Combined, the elements we've identified don't cross more than the - # authorized number of columns. + # Combined, the elements we've identified don't cross more + # than the authorized number of columns. # We're trying to avoid # 0: # 1: @@ -257,7 +275,8 @@ class TextEdges(object): # if len(zones) > TEXTEDGE_REQUIRED_ELEMENTS: new_area = (left, bottom, right, top) - # At this stage we've identified a plausible row (or beginning of one). + # At this stage we've identified a plausible row (or the + # beginning of one). 
keep_searching = True
 
         return new_area
 
@@ -272,26 +291,26 @@ class TextEdges(object):
         table_areas = {}
         for te in relevant_textedges:
-            if not table_areas:
+            if not table_areas:
+                table_areas[(te.x, te.y0, te.x, te.y1)] = None
+            else:
+                found = None
+                for area in table_areas:
+                    # check for overlap
+                    if te.y1 >= area[1] and te.y0 <= area[3]:
+                        found = area
+                        break
+                if found is None:
                 table_areas[(te.x, te.y0, te.x, te.y1)] = None
             else:
-                found = None
-                for area in table_areas:
-                    # check for overlap
-                    if te.y1 >= area[1] and te.y0 <= area[3]:
-                        found = area
-                        break
-                if found is None:
-                    table_areas[(te.x, te.y0, te.x, te.y1)] = None
-                else:
-                    table_areas.pop(found)
-                    updated_area = (
-                        found[0],
-                        min(te.y0, found[1]),
-                        max(found[2], te.x),
-                        max(found[3], te.y1),
-                    )
-                    table_areas[updated_area] = None
+                table_areas.pop(found)
+                updated_area = (
+                    found[0],
+                    min(te.y0, found[1]),
+                    max(found[2], te.x),
+                    max(found[3], te.y1),
+                )
+                table_areas[updated_area] = None
 
         # extend table areas based on textlines that overlap
         # vertically. it's possible that these textlines were
@@ -318,8 +337,8 @@ class TextEdges(object):
             )
             table_areas[updated_area] = None
 
-        # Apply a heuristic to salvage headers which formatting might be off compared to
-        # the rest of the table.
+        # Apply a heuristic to salvage headers whose formatting might be off
+        # compared to the rest of the table.
         average_textline_height = sum_textline_height / \
             float(len(textlines))
 
@@ -398,7 +417,10 @@ class Cell(object):
 
     def __repr__(self):
         return "<Cell x1={} y1={} x2={} y2={}>".format(
-            round(self.x1, 2), round(self.y1, 2), round(self.x2, 2), round(self.y2, 2)
+            round(self.x1, 2),
+            round(self.y1, 2),
+            round(self.x2, 2),
+            round(self.y2, 2)
         )
 
     @property
@@ -448,7 +470,9 @@ class Table(object):
     def __init__(self, cols, rows):
        self.cols = cols
        self.rows = rows
-        self.cells = [[Cell(c[0], r[1], c[1], r[0]) for c in cols] for r in rows]
+        self.cells = [
+            [Cell(c[0], r[1], c[1], r[0]) for c in cols] for r in rows
+        ]
        self.df = None
        self.shape = (0, 0)
        self.accuracy = 0
@@ -685,7 +709,8 @@ class Table(object):
            Output filepath.
""" - kw = {"encoding": "utf-8", "index": False, "header": False, "quoting": 1} + kw = {"encoding": "utf-8", "index": False, "header": False, + "quoting": 1} kw.update(kwargs) self.df.to_csv(path, **kw) @@ -798,7 +823,8 @@ class TableList(object): ext = kwargs.get("ext") for table in self._tables: filename = os.path.join( - "{}-page-{}-table-{}{}".format(root, table.page, table.order, ext) + "{}-page-{}-table-{}{}".format(root, table.page, table.order, + ext) ) filepath = os.path.join(dirname, filename) to_format = self._format_func(table, f) @@ -813,7 +839,10 @@ class TableList(object): with zipfile.ZipFile(zipname, "w", allowZip64=True) as z: for table in self._tables: filename = os.path.join( - "{}-page-{}-table-{}{}".format(root, table.page, table.order, ext) + "{}-page-{}-table-{}{}".format(root, + table.page, + table.order, + ext) ) filepath = os.path.join(dirname, filename) z.write(filepath, os.path.basename(filepath)) @@ -848,7 +877,8 @@ class TableList(object): writer = pd.ExcelWriter(filepath) for table in self._tables: sheet_name = "page-{}-table-{}".format(table.page, table.order) - table.df.to_excel(writer, sheet_name=sheet_name, encoding="utf-8") + table.df.to_excel(writer, sheet_name=sheet_name, + encoding="utf-8") writer.save() if compress: zipname = os.path.join(os.path.dirname(path), root) + ".zip" diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py index 33e3692..c939c8f 100644 --- a/camelot/parsers/stream.py +++ b/camelot/parsers/stream.py @@ -10,7 +10,8 @@ import pandas as pd from .base import BaseParser from ..core import TextEdges, Table -from ..utils import text_in_bbox, get_table_index, compute_accuracy, compute_whitespace +from ..utils import (text_in_bbox, get_table_index, compute_accuracy, + compute_whitespace) logger = logging.getLogger("camelot") @@ -124,8 +125,8 @@ class Stream(BaseParser): temp = [] for t in text: # is checking for upright necessary? 
- # if t.get_text().strip() and all([obj.upright for obj in t._objs if - # type(obj) is LTChar]): + # if t.get_text().strip() and all([obj.upright for obj in t._objs + # if type(obj) is LTChar]): if t.get_text().strip(): if not np.isclose(row_y, t.y0, atol=row_tol): rows.append(sorted(temp, key=lambda t: t.x0)) @@ -170,7 +171,8 @@ class Stream(BaseParser): merged.append(higher) elif column_tol < 0: if higher[0] <= lower[1]: - if np.isclose(higher[0], lower[1], atol=abs(column_tol)): + if np.isclose(higher[0], lower[1], + atol=abs(column_tol)): merged.append(higher) else: upper_bound = max(lower[1], higher[1]) @@ -200,8 +202,8 @@ class Stream(BaseParser): """ row_boundaries = [ [ - max([t.y1 for t in r]), - min([t.y0 for t in r]) + max(t.y1 for t in r), + min(t.y0 for t in r) ] for r in rows_grouped ] @@ -236,7 +238,9 @@ class Stream(BaseParser): text = Stream._group_rows(text, row_tol=row_tol) elements = [len(r) for r in text] new_cols = [ - (t.x0, t.x1) for r in text if len(r) == max(elements) for t in r + (t.x0, t.x1) + for r in text if len(r) == max(elements) + for t in r ] cols.extend(Stream._merge_columns(sorted(new_cols))) return cols @@ -268,7 +272,8 @@ class Stream(BaseParser): def _validate_columns(self): if self.table_areas is not None and self.columns is not None: if len(self.table_areas) != len(self.columns): - raise ValueError("Length of table_areas and columns" " should be equal") + raise ValueError("Length of table_areas and columns" + " should be equal") def _nurminen_table_detection(self, textlines): """A general implementation of the table detection algorithm @@ -290,7 +295,7 @@ class Stream(BaseParser): # guess table areas using textlines and relevant edges table_bbox = textedges.get_table_areas(textlines, relevant_textedges) # treat whole page as table area if no table areas found - if not len(table_bbox): + if not table_bbox: table_bbox = {(0, 0, self.pdf_width, self.pdf_height): None} return table_bbox @@ -339,7 +344,8 @@ class Stream(BaseParser): self.t_bbox = t_bbox - text_x_min, text_y_min, text_x_max, text_y_max = self._text_bbox(self.t_bbox) + text_x_min, text_y_min, text_x_max, text_y_max = \ + self._text_bbox(self.t_bbox) rows_grouped = self._group_rows(t_bbox_all, row_tol=self.row_tol) rows = self._join_rows(rows_grouped, text_y_max, text_y_min) elements = [len(r) for r in rows_grouped] @@ -365,14 +371,19 @@ class Stream(BaseParser): # see if the list contains elements, if yes, then use # the mode after removing 1s elements = list(filter(lambda x: x != 1, elements)) - if len(elements): + if elements: ncols = max(set(elements), key=elements.count) else: warnings.warn( - "No tables found in table area {}".format(table_idx + 1) + "No tables found in table area {}" + .format(table_idx + 1) ) - cols = [(t.x0, t.x1) for r in rows_grouped if len(r) == ncols for t in r] - cols = self._merge_columns(sorted(cols), column_tol=self.column_tol) + cols = [ + (t.x0, t.x1) for r in rows_grouped if len(r) == ncols + for t in r + ] + cols = self._merge_columns(sorted(cols), + column_tol=self.column_tol) inner_text = [] for i in range(1, len(cols)): left = cols[i - 1][1] @@ -442,20 +453,24 @@ class Stream(BaseParser): return table - def extract_tables(self, filename, suppress_stdout=False, layout_kwargs={}): + def extract_tables(self, filename, suppress_stdout=False, + layout_kwargs={}): self._generate_layout(filename, layout_kwargs) if not suppress_stdout: - logger.info("Processing {}".format(os.path.basename(self.rootname))) + logger.info("Processing {}".format( + 
os.path.basename(self.rootname))) if not self.horizontal_text: if self.images: warnings.warn( "{} is image-based, camelot only works on" - " text-based pages.".format(os.path.basename(self.rootname)) + " text-based pages.".format( + os.path.basename(self.rootname)) ) else: warnings.warn( - "No tables found on {}".format(os.path.basename(self.rootname)) + "No tables found on {}".format( + os.path.basename(self.rootname)) ) return [] diff --git a/tests/data.py b/tests/data.py index f11aba4..889f98a 100755 --- a/tests/data.py +++ b/tests/data.py @@ -2742,21 +2742,28 @@ data_stream_vertical_headers = [ '', 'Daniel G. Gauthier', 'Craig M. Clemens', 'Craig Johnston', 'Carolyn Brummund', 'Adam Brege', 'David Bielusiak', ''], ['Alcona', '963', '439', '55', '26', '47', '164', '173', '111', '', '268', - '', '272', '275', '269', '', '271', '', '224', '76', '', '', '', '', ''], - ['Caledonia', '923', '393', '40', '23', '45', '158', '150', '103', '', '244', - '', '247', '254', '255', '', '244', '', '139', '143', '', '', '', '', ''], + '', '272', '275', '269', '', '271', '', '224', '76', '', '', '', '', + ''], + ['Caledonia', '923', '393', '40', '23', '45', '158', '150', '103', '', + '244', '', '247', '254', '255', '', '244', '', '139', '143', '', '', + '', '', ''], ['Curtis', '1026', '349', '30', '30', '25', '102', '95', '84', '', '159', '', '164', '162', '161', '', '157', '', '', '', '', '', '', '', ''], - ['Greenbush', '1212', '423', '56', '26', '40', '126', '104', '131', '', '208', - '', '213', '214', '215', '', '208', '', '', '', '', '208', '', '', ''], + ['Greenbush', '1212', '423', '56', '26', '40', '126', '104', '131', '', + '208', '', '213', '214', '215', '', '208', '', '', '', '', '208', '', + '', ''], ['Gustin', '611', '180', '22', '35', '17', '55', '73', '45', '', '108', - '', '104', '111', '111', '', '109', '', '', '', '', '', '81', '42', ''], - ['Harrisville', '1142', '430', '45', '90', '29', '101', '155', '94', '', '226', - '', '226', '232', '244', '', '226', '', '', '', '232', '', '', '', ''], + '', '104', '111', '111', '', '109', '', '', '', '', '', '81', '42', + ''], + ['Harrisville', '1142', '430', '45', '90', '29', '101', '155', '94', '', + '226', '', '226', '232', '244', '', '226', '', '', '', '232', '', '', + '', ''], ['Hawes', '884', '293', '38', '36', '27', '109', '121', '84', '', '192', - '', '195', '195', '193', '', '184', '', '', '', '', '', '118', '87', ''], + '', '195', '195', '193', '', '184', '', '', '', '', '', '118', '87', + ''], ['Haynes', '626', '275', '31', '20', '32', '104', '121', '53', '', '163', - '', '163', '173', '161', '', '152', '', '', '', '76', '', '69', '31', ''], + '', '163', '173', '161', '', '152', '', '', '', '76', '', '69', '31', + ''], ['Mikado', '781', '208', '19', '39', '17', '81', '90', '63', '', '149', '', '149', '145', '147', '', '143', '', '', '', '', '113', '', '', ''], ['Millen', '353', '139', '7', '16', '13', '38', '49', '19', '', '62', @@ -2764,7 +2771,9 @@ data_stream_vertical_headers = [ ['Mitchell', '327', '96', '12', '17', '7', '29', '41', '17', '', '57', '', '55', '57', '60', '', '56', '', '', '', '', '', '', '', ''], ['City Harrisville', '389', '171', '16', '15', '18', '35', '49', '31', '', - '78', '', '80', '82', '81', '', '77', '', '', '', '73', '', '', '', ''], - ['Totals', '9237', '3396', '371', '373', '317', '1102', '1221', '835', '0', '1914', '0', - '1934', '1967', '1963', '0', '1889', '0', '363', '219', '381', '321', '268', '160', '0'] + '78', '', '80', '82', '81', '', '77', '', '', '', '73', '', '', '', + ''], + ['Totals', 
'9237', '3396', '371', '373', '317', '1102', '1221', '835', '0',
+     '1914', '0', '1934', '1967', '1963', '0', '1889', '0', '363', '219',
+     '381', '321', '268', '160', '0']
 ]
diff --git a/tests/test_common.py b/tests/test_common.py
index 468a1f5..120e4bd 100644
--- a/tests/test_common.py
+++ b/tests/test_common.py
@@ -2,6 +2,8 @@
 
 import os
 
+import pytest
+
 import pandas as pd
 from pandas.testing import assert_frame_equal
 
@@ -11,12 +13,30 @@ from camelot.__version__ import generate_version
 
 from .data import *
 
+import pdfminer
+
+# The version of PDFMiner has an impact on some of the tests. Unfortunately,
+# we can't enforce usage of a recent version of PDFMiner without dropping
+# support for Python 2.
+# To check the version of pdfminer.six installed:
+# pip freeze | grep pdfminer.six
+# To force upgrade:
+# pip install --upgrade --force-reinstall pdfminer.six
+# To force usage of a Python 2 compatible version:
+# pip install "pdfminer.six==20191110"
+# This condition can be removed in favor of a version requirement bump for
+# pdfminer.six once support for Python 2 is dropped.
+
+LEGACY_PDF_MINER = pdfminer.__version__ < "20200402"
+
 testdir = os.path.dirname(os.path.abspath(__file__))
 testdir = os.path.join(testdir, "files")
 
 
 def test_parsing_report():
-    parsing_report = {"accuracy": 99.02, "whitespace": 12.24, "order": 1, "page": 1}
+    parsing_report = {
+        "accuracy": 99.02, "whitespace": 12.24, "order": 1, "page": 1
+    }
 
     filename = os.path.join(testdir, "foo.pdf")
     tables = camelot.read_pdf(filename)
@@ -64,6 +84,8 @@ def test_stream_table_rotated():
     assert_frame_equal(df, result_without_first_row)
 
 
+@pytest.mark.skipif(LEGACY_PDF_MINER,
+                    reason="depends on a recent version of PDFMiner")
 def test_stream_two_tables():
     df1 = pd.DataFrame(data_stream_two_tables_1)
     df2 = pd.DataFrame(data_stream_two_tables_2)
@@ -106,6 +128,8 @@ def test_stream_columns():
     assert_frame_equal(df, tables[0].df)
 
 
+@pytest.mark.skipif(LEGACY_PDF_MINER,
+                    reason="depends on a recent version of PDFMiner")
 def test_stream_split_text():
     df = pd.DataFrame(data_stream_split_text)
 
@@ -143,6 +167,8 @@ def test_stream_edge_tol():
     assert_frame_equal(df, tables[0].df)
 
 
+@pytest.mark.skipif(LEGACY_PDF_MINER,
+                    reason="depends on a recent version of PDFMiner")
 def test_stream_layout_kwargs():
     df = pd.DataFrame(data_stream_layout_kwargs)
 
@@ -248,7 +274,8 @@ def test_repr():
     assert repr(tables) == "<TableList n=1>"
     assert repr(tables[0]) == "<Table shape=(7, 7)>"
     assert (
-        repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
+        repr(tables[0].cells[0][0])
+        == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
     )
 
 
@@ -258,21 +285,24 @@ def test_pages():
     assert repr(tables) == "<TableList n=1>"
     assert repr(tables[0]) == "<Table shape=(7, 7)>"
     assert (
-        repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
+        repr(tables[0].cells[0][0])
+        == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
     )
 
     tables = camelot.read_pdf(url, pages="1-end")
     assert repr(tables) == "<TableList n=1>"
     assert repr(tables[0]) == "<Table shape=(7, 7)>"
     assert (
-        repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
+        repr(tables[0].cells[0][0])
+        == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
     )
 
     tables = camelot.read_pdf(url, pages="all")
     assert repr(tables) == "<TableList n=1>"
     assert repr(tables[0]) == "<Table shape=(7, 7)>"
     assert (
-        repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
+        repr(tables[0].cells[0][0])
+        == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
     )
 
 
@@ -282,7 +312,8 @@ def test_url():
     assert repr(tables) == "<TableList n=1>"
     assert repr(tables[0]) == "<Table shape=(7, 7)>"
     assert (
-        repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
+        repr(tables[0].cells[0][0])
+        == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
     )
 
 
@@ -302,7 +333,12 @@ def test_table_order():
         return t
 
     table_list = TableList(
-        [_make_table(2, 1), _make_table(1, 1), _make_table(3, 4), _make_table(1, 2)]
+        [
+            _make_table(2, 1),
+            _make_table(1, 1),
+            _make_table(3, 4),
+            _make_table(1, 2)
+        ]
     )
 
     assert [(t.page, t.order) for t in sorted(table_list)] == [
diff --git a/tests/test_plotting.py b/tests/test_plotting.py
index f267e29..7646894 100644
--- a/tests/test_plotting.py
+++ b/tests/test_plotting.py
@@ -4,13 +4,30 @@
 
 import os
 
 import pytest
 
+import pdfminer
+
 import camelot
 
+# The version of PDFMiner has an impact on some of the tests. Unfortunately,
+# we can't enforce usage of a recent version of PDFMiner without dropping
+# support for Python 2.
+# To check the version of pdfminer.six installed:
+# pip freeze | grep pdfminer.six
+# To force upgrade:
+# pip install --upgrade --force-reinstall pdfminer.six
+# To force usage of a Python 2 compatible version:
+# pip install "pdfminer.six==20191110"
+# This condition can be removed in favor of a version requirement bump for
+# pdfminer.six once support for Python 2 is dropped.
+
+LEGACY_PDF_MINER = pdfminer.__version__ < "20200402"
 
 testdir = os.path.dirname(os.path.abspath(__file__))
 testdir = os.path.join(testdir, "files")
 
 
+@pytest.mark.skipif(LEGACY_PDF_MINER,
+                    reason="depends on a recent version of PDFMiner")
 @pytest.mark.mpl_image_compare(
     baseline_dir="files/baseline_plots", remove_text=True)
 def test_text_plot():
@@ -35,6 +52,8 @@ def test_lattice_contour_plot():
     return camelot.plot(tables[0], kind='contour')
 
 
+@pytest.mark.skipif(LEGACY_PDF_MINER,
+                    reason="depends on a recent version of PDFMiner")
 @pytest.mark.mpl_image_compare(
     baseline_dir="files/baseline_plots", remove_text=True)
 def test_stream_contour_plot():
@@ -59,6 +78,8 @@ def test_joint_plot():
     return camelot.plot(tables[0], kind='joint')
 
 
+@pytest.mark.skipif(LEGACY_PDF_MINER,
+                    reason="depends on a recent version of PDFMiner")
 @pytest.mark.mpl_image_compare(
     baseline_dir="files/baseline_plots", remove_text=True)
 def test_textedge_plot():