diff --git a/camelot/parsers/hybrid.py b/camelot/parsers/hybrid.py index 9410614..17b51a9 100644 --- a/camelot/parsers/hybrid.py +++ b/camelot/parsers/hybrid.py @@ -5,6 +5,7 @@ from __future__ import division import numpy as np import copy +import math from .base import TextBaseParser from ..core import ( @@ -120,7 +121,15 @@ def search_header_from_body_bbox(body_bbox, textlines, col_anchors, max_v_gap): ) ) ) - if max_spread <= MAX_COL_SPREAD_IN_HEADER: + + # Accept textlines that cross column boundaries, as long as they + # cross no more than MAX_COL_SPREAD_IN_HEADER columns and no more than + # half the number of columns (rounded up). + # This is to avoid picking unrelated paragraphs. + if max_spread <= min( + MAX_COL_SPREAD_IN_HEADER, + math.ceil(len(col_anchors) / 2) + ): # Combined, the elements we've identified don't cross more # than the authorized number of columns. # We're trying to avoid diff --git a/camelot/plotting.py b/camelot/plotting.py index 81d9694..12ba457 100644 --- a/camelot/plotting.py +++ b/camelot/plotting.py @@ -65,9 +65,9 @@ def draw_pdf(table, ax, to_pdf_scale=True): ---------- table : camelot.core.Table - ax : matplotlib.axes.Axes + ax : matplotlib.axes.Axes (optional) - to_pdf_scale : bool + to_pdf_scale : bool (optional) """ img = table.get_pdf_image() @@ -83,6 +83,7 @@ def draw_parse_constraints(table, ax): Parameters ---------- table : camelot.core.Table + ax : matplotlib.axes.Axes (optional) ax : matplotlib.axes.Axes @@ -110,8 +111,33 @@ def draw_parse_constraints(table, ax): ) +def prepare_plot(table, ax=None, to_pdf_scale=True): + """Initialize plot and draw common components + + Parameters + ---------- + table : camelot.core.Table + + ax : matplotlib.axes.Axes (optional) + Created on a new figure when None. + + to_pdf_scale : bool (optional) + Passed through to draw_pdf. + + Returns + ------- + ax : matplotlib.axes.Axes + """ + if ax is None: + fig = plt.figure() + ax = fig.add_subplot(111, aspect="equal") + draw_pdf(table, ax, to_pdf_scale) + draw_parse_constraints(table, ax) + return 
ax + + class PlotMethods(object): - def __call__(self, table, kind="text", filename=None): + def __call__(self, table, kind="text", filename=None, ax=None): """Plot elements found on PDF page based on kind specified, useful for debugging and playing with different parameters to get the best output. @@ -144,26 +170,24 @@ class PlotMethods(object): ) plot_method = getattr(self, kind) - return plot_method(table) + return plot_method(table, ax) @staticmethod - def text(table): + def text(table, ax=None): """Generates a plot for all text elements present on the PDF page. Parameters ---------- table : camelot.core.Table + ax : matplotlib.axes.Axes (optional) Returns ------- fig : matplotlib.fig.Figure """ - fig = plt.figure() - ax = fig.add_subplot(111, aspect="equal") - draw_pdf(table, ax) - draw_parse_constraints(table, ax) + ax = prepare_plot(table, ax) xs, ys = [], [] for t in table._text: xs.extend([t[0], t[2]]) @@ -178,26 +202,24 @@ class PlotMethods(object): ) ax.set_xlim(min(xs) - 10, max(xs) + 10) ax.set_ylim(min(ys) - 10, max(ys) + 10) - return fig + return ax.get_figure() @staticmethod - def grid(table): + def grid(table, ax=None): """Generates a plot for the detected table grids on the PDF page. Parameters ---------- table : camelot.core.Table + ax : matplotlib.axes.Axes (optional) Returns ------- fig : matplotlib.fig.Figure """ - fig = plt.figure() - ax = fig.add_subplot(111, aspect="equal") - draw_pdf(table, ax) - draw_parse_constraints(table, ax) + ax = prepare_plot(table, ax) for row in table.cells: for cell in row: if cell.left: @@ -208,27 +230,25 @@ class PlotMethods(object): ax.plot([cell.lt[0], cell.rt[0]], [cell.lt[1], cell.rt[1]]) if cell.bottom: ax.plot([cell.lb[0], cell.rb[0]], [cell.lb[1], cell.rb[1]]) - return fig + return ax.get_figure() @staticmethod - def contour(table): + def contour(table, ax=None): """Generates a plot for all table boundaries present on the PDF page. 
Parameters ---------- table : camelot.core.Table + ax : matplotlib.axes.Axes (optional) Returns ------- fig : matplotlib.fig.Figure """ - fig = plt.figure() - ax = fig.add_subplot(111, aspect="equal") _FOR_LATTICE = table.flavor == "lattice" - draw_pdf(table, ax, to_pdf_scale=not _FOR_LATTICE) - draw_parse_constraints(table, ax) + ax = prepare_plot(table, ax, to_pdf_scale=not _FOR_LATTICE) if _FOR_LATTICE: table_bbox = table._bbox_unscaled @@ -260,25 +280,23 @@ class PlotMethods(object): ys.extend([t[1], t[3]]) ax.set_xlim(min(xs) - 10, max(xs) + 10) ax.set_ylim(min(ys) - 10, max(ys) + 10) - return fig + return ax.get_figure() @staticmethod - def textedge(table): + def textedge(table, ax=None): """Generates a plot for relevant textedges. Parameters ---------- table : camelot.core.Table + ax : matplotlib.axes.Axes (optional) Returns ------- fig : matplotlib.fig.Figure """ - fig = plt.figure() - ax = fig.add_subplot(111, aspect="equal") - draw_pdf(table, ax) - draw_parse_constraints(table, ax) + ax = prepare_plot(table, ax) xs, ys = [], [] for t in table._text: xs.extend([t[0], t[2]]) @@ -352,26 +370,24 @@ class PlotMethods(object): else: for te in table._textedges: ax.plot([te.coord, te.coord], [te.y0, te.y1]) - return fig + return ax.get_figure() @staticmethod - def joint(table): + def joint(table, ax=None): """Generates a plot for all line intersections present on the PDF page. 
Parameters ---------- table : camelot.core.Table + ax : matplotlib.axes.Axes (optional) Returns ------- fig : matplotlib.fig.Figure """ - fig = plt.figure() - ax = fig.add_subplot(111, aspect="equal") - draw_pdf(table, ax, to_pdf_scale=False) - draw_parse_constraints(table, ax) + ax = prepare_plot(table, ax, to_pdf_scale=False) table_bbox = table._bbox_unscaled x_coord = [] y_coord = [] @@ -380,53 +396,48 @@ class PlotMethods(object): x_coord.append(coord[0]) y_coord.append(coord[1]) ax.plot(x_coord, y_coord, "ro") - return fig + return ax.get_figure() @staticmethod - def line(table): + def line(table, ax=None): """Generates a plot for all line segments present on the PDF page. Parameters ---------- table : camelot.core.Table + ax : matplotlib.axes.Axes (optional) Returns ------- fig : matplotlib.fig.Figure """ - fig = plt.figure() - ax = fig.add_subplot(111, aspect="equal") - draw_pdf(table, ax) - draw_parse_constraints(table, ax) + ax = prepare_plot(table, ax) vertical, horizontal = table._segments for v in vertical: ax.plot([v[0], v[2]], [v[1], v[3]]) for h in horizontal: ax.plot([h[0], h[2]], [h[1], h[3]]) - return fig + return ax.get_figure() @staticmethod - def hybrid_table_search(table): + def hybrid_table_search(table, ax=None): """Generates a plot illustrating the steps of the hybrid table search. 
Parameters ---------- table : camelot.core.Table + ax : matplotlib.axes.Axes (optional) Returns ------- fig : matplotlib.fig.Figure """ - fig = plt.figure() - ax = fig.add_subplot(111, aspect="equal") - draw_pdf(table, ax) - draw_parse_constraints(table, ax) - + ax = prepare_plot(table, ax) if table.parse_details is None: - return fig + return ax.get_figure() parse_details = table.parse_details for box_id, bbox_search in enumerate(parse_details["bbox_searches"]): max_h_gap = bbox_search["max_h_gap"] @@ -476,4 +487,4 @@ class PlotMethods(object): label_pos="bottom,left" ) - return fig + return ax.get_figure() diff --git a/tests/data.py b/tests/data.py index 1205de7..ccb0d7e 100755 --- a/tests/data.py +++ b/tests/data.py @@ -1609,6 +1609,22 @@ data_stream_two_tables_2 = [ ["1 Except forcible rape and prostitution.", "", "", "", "", ""], ] +data_hybrid_two_tables_b_1 = [ + ["1", "Ghfhbdhj", "1", "Hgfdhgjsdhjdsf"], + ["Vgvhgh", "Hj", "Hj", "Hj"], + ["Hj", "Hj", "Hj", "Hj"], + ["Hj", "Hj", "J", "Hj"], + ["V", "C", "D", "Gfhj"], + ["Hjb", "B", "Jhbh", "Hj"], + ["Hjdhshj", "Hjhjhh", "Ddnj", "dsxv"], +] + +data_hybrid_two_tables_b_2 = [ + ["Trtrt", "H", "Gh"], + ["Gh", "V", "Hv"], + ["Hv", "Bhjb", "hg"], +] + # The streaming algorithm incorrectly includes a header and a footer. # Trimming the table for the test of hybrid, which doesn't include it. 
data_hybrid_two_tables_2 = data_stream_two_tables_2[3:-1] diff --git a/tests/files/camelot-issue-132-multiple-tables.pdf b/tests/files/camelot-issue-132-multiple-tables.pdf new file mode 100644 index 0000000..0b9d854 Binary files /dev/null and b/tests/files/camelot-issue-132-multiple-tables.pdf differ diff --git a/tests/test_common.py b/tests/test_common.py index 3d38788..10407a5 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -175,7 +175,7 @@ def test_hybrid_table_rotated(): assert_frame_equal(df, tables[0].df) -def test_hybrid_two_tables(): +def test_hybrid_two_tables_a(): df1 = pd.DataFrame(data_hybrid_two_tables_1) df2 = pd.DataFrame(data_hybrid_two_tables_2) @@ -187,6 +187,19 @@ def test_hybrid_two_tables(): assert df2.equals(tables[1].df) +# Reported as https://github.com/camelot-dev/camelot/issues/132 +def test_hybrid_two_tables_b(): + df1 = pd.DataFrame(data_hybrid_two_tables_b_1) + df2 = pd.DataFrame(data_hybrid_two_tables_b_2) + + filename = os.path.join(testdir, "camelot-issue-132-multiple-tables.pdf") + tables = camelot.read_pdf(filename, flavor="hybrid") + + assert len(tables) == 2 + assert df1.equals(tables[0].df) + assert df2.equals(tables[1].df) + + def test_hybrid_table_regions(): df = pd.DataFrame(data_hybrid_table_regions)