diff --git a/HISTORY.md b/HISTORY.md index 4cf77ba..878dc08 100755 --- a/HISTORY.md +++ b/HISTORY.md @@ -4,6 +4,18 @@ Release History master ------ +**Improvements** + +* [#207](https://github.com/socialcopsdev/camelot/issues/207) Add a plot type for Stream text edges and detected table areas. [#224](https://github.com/socialcopsdev/camelot/pull/224) by Vinayak Mehta. + +**Bugfixes** + +* [#217](https://github.com/socialcopsdev/camelot/issues/217) Fix IndexError when scale is large. + +**Documentation** + +* Add pdfplumber comparison and update Tabula (stream) comparison. Check out the [wiki page](https://github.com/socialcopsdev/camelot/wiki/Comparison-with-other-PDF-Table-Extraction-libraries-and-tools). + 0.4.1 (2018-12-05) ------------------ diff --git a/camelot/cli.py b/camelot/cli.py index eaae955..1b995aa 100644 --- a/camelot/cli.py +++ b/camelot/cli.py @@ -138,7 +138,7 @@ def lattice(c, *args, **kwargs): @click.option('-c', '--col_close_tol', default=0, help='Tolerance parameter' ' used to combine text horizontally, to generate columns.') @click.option('-plot', '--plot_type', - type=click.Choice(['text', 'grid']), + type=click.Choice(['text', 'grid', 'contour', 'textedge']), help='Plot elements found on PDF page for visual debugging.') @click.argument('filepath', type=click.Path(exists=True)) @pass_config diff --git a/camelot/parsers/lattice.py b/camelot/parsers/lattice.py index 22c77e8..14d8f6c 100644 --- a/camelot/parsers/lattice.py +++ b/camelot/parsers/lattice.py @@ -341,6 +341,7 @@ class Lattice(BaseParser): table._text = _text table._image = (self.image, self.table_bbox_unscaled) table._segments = (self.vertical_segments, self.horizontal_segments) + table._textedges = None return table diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py index 3b9c068..b6785df 100644 --- a/camelot/parsers/stream.py +++ b/camelot/parsers/stream.py @@ -263,6 +263,7 @@ class Stream(BaseParser): textedges.generate(textlines) # select relevant edges 
relevant_textedges = textedges.get_relevant() + self.textedges.extend(relevant_textedges) # guess table areas using textlines and relevant edges table_bbox = textedges.get_table_areas(textlines, relevant_textedges) # treat whole page as table area if no table areas found @@ -272,6 +273,7 @@ class Stream(BaseParser): return table_bbox def _generate_table_bbox(self): + self.textedges = [] if self.table_areas is not None: table_bbox = {} for area in self.table_areas: @@ -378,6 +380,7 @@ class Stream(BaseParser): table._text = _text table._image = None table._segments = None + table._textedges = self.textedges return table diff --git a/camelot/plotting.py b/camelot/plotting.py index 3b91cee..1320267 100644 --- a/camelot/plotting.py +++ b/camelot/plotting.py @@ -33,7 +33,10 @@ class PlotMethods(object): if not _HAS_MPL: raise ImportError('matplotlib is required for plotting.') - if table.flavor == 'stream' and kind in ['contour', 'joint', 'line']: + if table.flavor == 'lattice' and kind in ['textedge']: + raise NotImplementedError("Lattice flavor does not support kind='{}'".format( + kind)) + elif table.flavor == 'stream' and kind in ['joint', 'line']: raise NotImplementedError("Stream flavor does not support kind='{}'".format( kind)) @@ -114,20 +117,82 @@ class PlotMethods(object): fig : matplotlib.fig.Figure """ - img, table_bbox = table._image + try: + img, table_bbox = table._image + _FOR_LATTICE = True + except TypeError: + img, table_bbox = (None, {table._bbox: None}) + _FOR_LATTICE = False fig = plt.figure() ax = fig.add_subplot(111, aspect='equal') + + xs, ys = [], [] + if not _FOR_LATTICE: + for t in table._text: + xs.extend([t[0], t[2]]) + ys.extend([t[1], t[3]]) + ax.add_patch( + patches.Rectangle( + (t[0], t[1]), + t[2] - t[0], + t[3] - t[1], + color='blue' + ) + ) + for t in table_bbox.keys(): ax.add_patch( patches.Rectangle( (t[0], t[1]), t[2] - t[0], t[3] - t[1], - fill=None, - edgecolor='red' + fill=False, + color='red' ) ) - ax.imshow(img) + if not 
_FOR_LATTICE: + xs.extend([t[0], t[2]]) + ys.extend([t[1], t[3]]) + ax.set_xlim(min(xs) - 10, max(xs) + 10) + ax.set_ylim(min(ys) - 10, max(ys) + 10) + + if _FOR_LATTICE: + ax.imshow(img) + return fig + + def textedge(self, table): + """Generates a plot for relevant textedges. + + Parameters + ---------- + table : camelot.core.Table + + Returns + ------- + fig : matplotlib.fig.Figure + + """ + fig = plt.figure() + ax = fig.add_subplot(111, aspect='equal') + xs, ys = [], [] + for t in table._text: + xs.extend([t[0], t[2]]) + ys.extend([t[1], t[3]]) + ax.add_patch( + patches.Rectangle( + (t[0], t[1]), + t[2] - t[0], + t[3] - t[1], + color='blue' + ) + ) + ax.set_xlim(min(xs) - 10, max(xs) + 10) + ax.set_ylim(min(ys) - 10, max(ys) + 10) + + for te in table._textedges: + ax.plot([te.x, te.x], + [te.y0, te.y1]) + return fig def joint(self, table): diff --git a/docs/_static/png/plot_textedge.png b/docs/_static/png/plot_textedge.png new file mode 100755 index 0000000..fb2f36b Binary files /dev/null and b/docs/_static/png/plot_textedge.png differ diff --git a/docs/user/advanced.rst b/docs/user/advanced.rst index 9610408..d2c8b35 100644 --- a/docs/user/advanced.rst +++ b/docs/user/advanced.rst @@ -41,8 +41,9 @@ You can specify the type of element you want to plot using the ``kind`` keyword - 'contour' - 'line' - 'joint' +- 'textedge' -.. note:: The last three plot types can only be used with :ref:`Lattice <lattice>`, i.e. when ``flavor='lattice'``. +.. note:: 'line' and 'joint' can only be used with :ref:`Lattice <lattice>` and 'textedge' can only be used with :ref:`Stream <stream>`. Let's generate a plot for each type using this `PDF <../_static/pdf/foo.pdf>`__ as an example. First, let's get all the tables out. @@ -143,6 +144,23 @@ Finally, let's plot all line intersections present on the table's PDF page. :alt: A plot of all line intersections on a PDF page :align: left +textedge +^^^^^^^^ + +You can also visualize the textedges found on a page by specifying ``kind='textedge'``. 
To know more about what a "textedge" is, you can see pages 20, 35 and 40 of `Anssi Nurminen's master's thesis <http://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3>`_. + +:: + + >>> camelot.plot(tables[0], kind='textedge') + >>> plt.show() + +.. figure:: ../_static/png/plot_textedge.png + :height: 674 + :width: 1366 + :scale: 50% + :alt: A plot of relevant textedges on a PDF page + :align: left + Specify table areas ------------------- diff --git a/tests/files/baseline_plots/test_lattice_contour_plot.png b/tests/files/baseline_plots/test_lattice_contour_plot.png new file mode 100644 index 0000000..57b3962 Binary files /dev/null and b/tests/files/baseline_plots/test_lattice_contour_plot.png differ diff --git a/tests/files/baseline_plots/test_stream_contour_plot.png b/tests/files/baseline_plots/test_stream_contour_plot.png new file mode 100644 index 0000000..a6e77f7 Binary files /dev/null and b/tests/files/baseline_plots/test_stream_contour_plot.png differ diff --git a/tests/files/baseline_plots/test_textedge_plot.png b/tests/files/baseline_plots/test_textedge_plot.png new file mode 100644 index 0000000..63fc236 Binary files /dev/null and b/tests/files/baseline_plots/test_textedge_plot.png differ diff --git a/tests/test_plotting.py b/tests/test_plotting.py index eeea81a..f267e29 100644 --- a/tests/test_plotting.py +++ b/tests/test_plotting.py @@ -29,12 +29,20 @@ def test_grid_plot(): @pytest.mark.mpl_image_compare( baseline_dir="files/baseline_plots", remove_text=True) -def test_contour_plot(): +def test_lattice_contour_plot(): filename = os.path.join(testdir, "foo.pdf") tables = camelot.read_pdf(filename) return camelot.plot(tables[0], kind='contour') +@pytest.mark.mpl_image_compare( + baseline_dir="files/baseline_plots", remove_text=True) +def test_stream_contour_plot(): + filename = os.path.join(testdir, "tabula/12s0324.pdf") + tables = camelot.read_pdf(filename, flavor='stream') + return camelot.plot(tables[0], kind='contour') + + @pytest.mark.mpl_image_compare( baseline_dir="files/baseline_plots", remove_text=True) 
def test_line_plot(): @@ -49,3 +57,11 @@ def test_joint_plot(): filename = os.path.join(testdir, "foo.pdf") tables = camelot.read_pdf(filename) return camelot.plot(tables[0], kind='joint') + + +@pytest.mark.mpl_image_compare( + baseline_dir="files/baseline_plots", remove_text=True) +def test_textedge_plot(): + filename = os.path.join(testdir, "tabula/12s0324.pdf") + tables = camelot.read_pdf(filename, flavor='stream') + return camelot.plot(tables[0], kind='textedge')