Merge pull request #224 from socialcopsdev/fix-207

[MRG] Add plot types and update docs
pull/2/head
Vinayak Mehta 2018-12-12 08:53:58 +05:30 committed by GitHub
commit 50780e24f8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 123 additions and 8 deletions

View File

@ -4,6 +4,18 @@ Release History
master master
------ ------
**Improvements**
* [#207](https://github.com/socialcopsdev/camelot/issues/207) Add a plot type for Stream text edges and detected table areas. [#224](https://github.com/socialcopsdev/camelot/pull/224) by Vinayak Mehta.
**Bugfixes**
* [#217](https://github.com/socialcopsdev/camelot/issues/217) Fix IndexError when scale is large.
**Documentation**
* Add pdfplumber comparison and update Tabula (stream) comparison. Check out the [wiki page](https://github.com/socialcopsdev/camelot/wiki/Comparison-with-other-PDF-Table-Extraction-libraries-and-tools).
0.4.1 (2018-12-05) 0.4.1 (2018-12-05)
------------------ ------------------

View File

@ -138,7 +138,7 @@ def lattice(c, *args, **kwargs):
@click.option('-c', '--col_close_tol', default=0, help='Tolerance parameter' @click.option('-c', '--col_close_tol', default=0, help='Tolerance parameter'
' used to combine text horizontally, to generate columns.') ' used to combine text horizontally, to generate columns.')
@click.option('-plot', '--plot_type', @click.option('-plot', '--plot_type',
type=click.Choice(['text', 'grid']), type=click.Choice(['text', 'grid', 'contour', 'textedge']),
help='Plot elements found on PDF page for visual debugging.') help='Plot elements found on PDF page for visual debugging.')
@click.argument('filepath', type=click.Path(exists=True)) @click.argument('filepath', type=click.Path(exists=True))
@pass_config @pass_config

View File

@ -341,6 +341,7 @@ class Lattice(BaseParser):
table._text = _text table._text = _text
table._image = (self.image, self.table_bbox_unscaled) table._image = (self.image, self.table_bbox_unscaled)
table._segments = (self.vertical_segments, self.horizontal_segments) table._segments = (self.vertical_segments, self.horizontal_segments)
table._textedges = None
return table return table

View File

@ -263,6 +263,7 @@ class Stream(BaseParser):
textedges.generate(textlines) textedges.generate(textlines)
# select relevant edges # select relevant edges
relevant_textedges = textedges.get_relevant() relevant_textedges = textedges.get_relevant()
self.textedges.extend(relevant_textedges)
# guess table areas using textlines and relevant edges # guess table areas using textlines and relevant edges
table_bbox = textedges.get_table_areas(textlines, relevant_textedges) table_bbox = textedges.get_table_areas(textlines, relevant_textedges)
# treat whole page as table area if no table areas found # treat whole page as table area if no table areas found
@ -272,6 +273,7 @@ class Stream(BaseParser):
return table_bbox return table_bbox
def _generate_table_bbox(self): def _generate_table_bbox(self):
self.textedges = []
if self.table_areas is not None: if self.table_areas is not None:
table_bbox = {} table_bbox = {}
for area in self.table_areas: for area in self.table_areas:
@ -378,6 +380,7 @@ class Stream(BaseParser):
table._text = _text table._text = _text
table._image = None table._image = None
table._segments = None table._segments = None
table._textedges = self.textedges
return table return table

View File

@ -33,7 +33,10 @@ class PlotMethods(object):
if not _HAS_MPL: if not _HAS_MPL:
raise ImportError('matplotlib is required for plotting.') raise ImportError('matplotlib is required for plotting.')
if table.flavor == 'stream' and kind in ['contour', 'joint', 'line']: if table.flavor == 'lattice' and kind in ['textedge']:
raise NotImplementedError("Lattice flavor does not support kind='{}'".format(
kind))
elif table.flavor == 'stream' and kind in ['joint', 'line']:
raise NotImplementedError("Stream flavor does not support kind='{}'".format( raise NotImplementedError("Stream flavor does not support kind='{}'".format(
kind)) kind))
@ -114,22 +117,84 @@ class PlotMethods(object):
fig : matplotlib.fig.Figure fig : matplotlib.fig.Figure
""" """
try:
img, table_bbox = table._image img, table_bbox = table._image
_FOR_LATTICE = True
except TypeError:
img, table_bbox = (None, {table._bbox: None})
_FOR_LATTICE = False
fig = plt.figure() fig = plt.figure()
ax = fig.add_subplot(111, aspect='equal') ax = fig.add_subplot(111, aspect='equal')
xs, ys = [], []
if not _FOR_LATTICE:
for t in table._text:
xs.extend([t[0], t[2]])
ys.extend([t[1], t[3]])
ax.add_patch(
patches.Rectangle(
(t[0], t[1]),
t[2] - t[0],
t[3] - t[1],
color='blue'
)
)
for t in table_bbox.keys(): for t in table_bbox.keys():
ax.add_patch( ax.add_patch(
patches.Rectangle( patches.Rectangle(
(t[0], t[1]), (t[0], t[1]),
t[2] - t[0], t[2] - t[0],
t[3] - t[1], t[3] - t[1],
fill=None, fill=False,
edgecolor='red' color='red'
) )
) )
if not _FOR_LATTICE:
xs.extend([t[0], t[2]])
ys.extend([t[1], t[3]])
ax.set_xlim(min(xs) - 10, max(xs) + 10)
ax.set_ylim(min(ys) - 10, max(ys) + 10)
if _FOR_LATTICE:
ax.imshow(img) ax.imshow(img)
return fig return fig
def textedge(self, table):
    """Generates a plot for relevant textedges.

    Every entry of ``table._text`` is drawn as a blue rectangle and
    every entry of ``table._textedges`` as a vertical line running from
    ``y0`` to ``y1`` at its ``x`` coordinate.

    Parameters
    ----------
    table : camelot.core.Table

    Returns
    -------
    fig : matplotlib.fig.Figure

    """
    fig = plt.figure()
    ax = fig.add_subplot(111, aspect='equal')
    xs, ys = [], []
    for t in table._text:
        # each entry is indexed as (x0, y0, x1, y1)
        xs.extend([t[0], t[2]])
        ys.extend([t[1], t[3]])
        ax.add_patch(
            patches.Rectangle(
                (t[0], t[1]),
                t[2] - t[0],
                t[3] - t[1],
                color='blue'
            )
        )
    # min()/max() raise ValueError on an empty sequence, so only pin the
    # axis limits (with a 10 unit margin) when something was drawn.
    if xs:
        ax.set_xlim(min(xs) - 10, max(xs) + 10)
        ax.set_ylim(min(ys) - 10, max(ys) + 10)

    for te in table._textedges:
        ax.plot([te.x, te.x],
                [te.y0, te.y1])

    return fig
def joint(self, table): def joint(self, table):
"""Generates a plot for all line intersections present """Generates a plot for all line intersections present
on the PDF page. on the PDF page.

Binary file not shown.

After

Width:  |  Height:  |  Size: 68 KiB

View File

@ -41,8 +41,9 @@ You can specify the type of element you want to plot using the ``kind`` keyword
- 'contour' - 'contour'
- 'line' - 'line'
- 'joint' - 'joint'
- 'textedge'
.. note:: The last three plot types can only be used with :ref:`Lattice <lattice>`, i.e. when ``flavor='lattice'``. .. note:: 'line' and 'joint' can only be used with :ref:`Lattice <lattice>` and 'textedge' can only be used with :ref:`Stream <stream>`.
Let's generate a plot for each type using this `PDF <../_static/pdf/foo.pdf>`__ as an example. First, let's get all the tables out. Let's generate a plot for each type using this `PDF <../_static/pdf/foo.pdf>`__ as an example. First, let's get all the tables out.
@ -143,6 +144,23 @@ Finally, let's plot all line intersections present on the table's PDF page.
:alt: A plot of all line intersections on a PDF page :alt: A plot of all line intersections on a PDF page
:align: left :align: left
textedge
^^^^^^^^
You can also visualize the textedges found on a page by specifying ``kind='textedge'``. To learn more about what a "textedge" is, see pages 20, 35 and 40 of `Anssi Nurminen's master's thesis <http://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3>`_.
::
>>> camelot.plot(tables[0], kind='textedge')
>>> plt.show()
.. figure:: ../_static/png/plot_textedge.png
:height: 674
:width: 1366
:scale: 50%
:alt: A plot of relevant textedges on a PDF page
:align: left
Specify table areas Specify table areas
------------------- -------------------

Binary file not shown.

After

Width:  |  Height:  |  Size: 33 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 13 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 18 KiB

View File

@ -29,12 +29,20 @@ def test_grid_plot():
@pytest.mark.mpl_image_compare( @pytest.mark.mpl_image_compare(
baseline_dir="files/baseline_plots", remove_text=True) baseline_dir="files/baseline_plots", remove_text=True)
def test_contour_plot(): def test_lattice_contour_plot():
filename = os.path.join(testdir, "foo.pdf") filename = os.path.join(testdir, "foo.pdf")
tables = camelot.read_pdf(filename) tables = camelot.read_pdf(filename)
return camelot.plot(tables[0], kind='contour') return camelot.plot(tables[0], kind='contour')
@pytest.mark.mpl_image_compare(
    baseline_dir="files/baseline_plots",
    remove_text=True,
)
def test_stream_contour_plot():
    """Plot the first Stream table of 12s0324.pdf with kind='contour'."""
    pdf_path = os.path.join(testdir, "tabula/12s0324.pdf")
    first_table = camelot.read_pdf(pdf_path, flavor='stream')[0]
    return camelot.plot(first_table, kind='contour')
@pytest.mark.mpl_image_compare( @pytest.mark.mpl_image_compare(
baseline_dir="files/baseline_plots", remove_text=True) baseline_dir="files/baseline_plots", remove_text=True)
def test_line_plot(): def test_line_plot():
@ -49,3 +57,11 @@ def test_joint_plot():
filename = os.path.join(testdir, "foo.pdf") filename = os.path.join(testdir, "foo.pdf")
tables = camelot.read_pdf(filename) tables = camelot.read_pdf(filename)
return camelot.plot(tables[0], kind='joint') return camelot.plot(tables[0], kind='joint')
@pytest.mark.mpl_image_compare(
    baseline_dir="files/baseline_plots",
    remove_text=True,
)
def test_textedge_plot():
    """Plot the first Stream table of 12s0324.pdf with kind='textedge'."""
    pdf_path = os.path.join(testdir, "tabula/12s0324.pdf")
    first_table = camelot.read_pdf(pdf_path, flavor='stream')[0]
    return camelot.plot(first_table, kind='textedge')