commit
50780e24f8
12
HISTORY.md
12
HISTORY.md
|
|
@ -4,6 +4,18 @@ Release History
|
|||
master
|
||||
------
|
||||
|
||||
**Improvements**
|
||||
|
||||
* [#207](https://github.com/socialcopsdev/camelot/issues/207) Add a plot type for Stream text edges and detected table areas. [#224](https://github.com/socialcopsdev/camelot/pull/224) by Vinayak Mehta.
|
||||
|
||||
**Bugfixes**
|
||||
|
||||
* [#217](https://github.com/socialcopsdev/camelot/issues/217) Fix IndexError when scale is large.
|
||||
|
||||
**Documentation**
|
||||
|
||||
* Add pdfplumber comparison and update Tabula (stream) comparison. Check out the [wiki page](https://github.com/socialcopsdev/camelot/wiki/Comparison-with-other-PDF-Table-Extraction-libraries-and-tools).
|
||||
|
||||
0.4.1 (2018-12-05)
|
||||
------------------
|
||||
|
||||
|
|
|
|||
|
|
@ -138,7 +138,7 @@ def lattice(c, *args, **kwargs):
|
|||
@click.option('-c', '--col_close_tol', default=0, help='Tolerance parameter'
|
||||
' used to combine text horizontally, to generate columns.')
|
||||
@click.option('-plot', '--plot_type',
|
||||
type=click.Choice(['text', 'grid']),
|
||||
type=click.Choice(['text', 'grid', 'contour', 'textedge']),
|
||||
help='Plot elements found on PDF page for visual debugging.')
|
||||
@click.argument('filepath', type=click.Path(exists=True))
|
||||
@pass_config
|
||||
|
|
|
|||
|
|
@ -341,6 +341,7 @@ class Lattice(BaseParser):
|
|||
table._text = _text
|
||||
table._image = (self.image, self.table_bbox_unscaled)
|
||||
table._segments = (self.vertical_segments, self.horizontal_segments)
|
||||
table._textedges = None
|
||||
|
||||
return table
|
||||
|
||||
|
|
|
|||
|
|
@ -263,6 +263,7 @@ class Stream(BaseParser):
|
|||
textedges.generate(textlines)
|
||||
# select relevant edges
|
||||
relevant_textedges = textedges.get_relevant()
|
||||
self.textedges.extend(relevant_textedges)
|
||||
# guess table areas using textlines and relevant edges
|
||||
table_bbox = textedges.get_table_areas(textlines, relevant_textedges)
|
||||
# treat whole page as table area if no table areas found
|
||||
|
|
@ -272,6 +273,7 @@ class Stream(BaseParser):
|
|||
return table_bbox
|
||||
|
||||
def _generate_table_bbox(self):
|
||||
self.textedges = []
|
||||
if self.table_areas is not None:
|
||||
table_bbox = {}
|
||||
for area in self.table_areas:
|
||||
|
|
@ -378,6 +380,7 @@ class Stream(BaseParser):
|
|||
table._text = _text
|
||||
table._image = None
|
||||
table._segments = None
|
||||
table._textedges = self.textedges
|
||||
|
||||
return table
|
||||
|
||||
|
|
|
|||
|
|
@ -33,7 +33,10 @@ class PlotMethods(object):
|
|||
if not _HAS_MPL:
|
||||
raise ImportError('matplotlib is required for plotting.')
|
||||
|
||||
if table.flavor == 'stream' and kind in ['contour', 'joint', 'line']:
|
||||
if table.flavor == 'lattice' and kind in ['textedge']:
|
||||
raise NotImplementedError("Lattice flavor does not support kind='{}'".format(
|
||||
kind))
|
||||
elif table.flavor == 'stream' and kind in ['joint', 'line']:
|
||||
raise NotImplementedError("Stream flavor does not support kind='{}'".format(
|
||||
kind))
|
||||
|
||||
|
|
@ -114,20 +117,82 @@ class PlotMethods(object):
|
|||
fig : matplotlib.fig.Figure
|
||||
|
||||
"""
|
||||
img, table_bbox = table._image
|
||||
try:
|
||||
img, table_bbox = table._image
|
||||
_FOR_LATTICE = True
|
||||
except TypeError:
|
||||
img, table_bbox = (None, {table._bbox: None})
|
||||
_FOR_LATTICE = False
|
||||
fig = plt.figure()
|
||||
ax = fig.add_subplot(111, aspect='equal')
|
||||
|
||||
xs, ys = [], []
|
||||
if not _FOR_LATTICE:
|
||||
for t in table._text:
|
||||
xs.extend([t[0], t[2]])
|
||||
ys.extend([t[1], t[3]])
|
||||
ax.add_patch(
|
||||
patches.Rectangle(
|
||||
(t[0], t[1]),
|
||||
t[2] - t[0],
|
||||
t[3] - t[1],
|
||||
color='blue'
|
||||
)
|
||||
)
|
||||
|
||||
for t in table_bbox.keys():
|
||||
ax.add_patch(
|
||||
patches.Rectangle(
|
||||
(t[0], t[1]),
|
||||
t[2] - t[0],
|
||||
t[3] - t[1],
|
||||
fill=None,
|
||||
edgecolor='red'
|
||||
fill=False,
|
||||
color='red'
|
||||
)
|
||||
)
|
||||
ax.imshow(img)
|
||||
if not _FOR_LATTICE:
|
||||
xs.extend([t[0], t[2]])
|
||||
ys.extend([t[1], t[3]])
|
||||
ax.set_xlim(min(xs) - 10, max(xs) + 10)
|
||||
ax.set_ylim(min(ys) - 10, max(ys) + 10)
|
||||
|
||||
if _FOR_LATTICE:
|
||||
ax.imshow(img)
|
||||
return fig
|
||||
|
||||
def textedge(self, table):
|
||||
"""Generates a plot for relevant textedges.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
table : camelot.core.Table
|
||||
|
||||
Returns
|
||||
-------
|
||||
fig : matplotlib.fig.Figure
|
||||
|
||||
"""
|
||||
fig = plt.figure()
|
||||
ax = fig.add_subplot(111, aspect='equal')
|
||||
xs, ys = [], []
|
||||
for t in table._text:
|
||||
xs.extend([t[0], t[2]])
|
||||
ys.extend([t[1], t[3]])
|
||||
ax.add_patch(
|
||||
patches.Rectangle(
|
||||
(t[0], t[1]),
|
||||
t[2] - t[0],
|
||||
t[3] - t[1],
|
||||
color='blue'
|
||||
)
|
||||
)
|
||||
ax.set_xlim(min(xs) - 10, max(xs) + 10)
|
||||
ax.set_ylim(min(ys) - 10, max(ys) + 10)
|
||||
|
||||
for te in table._textedges:
|
||||
ax.plot([te.x, te.x],
|
||||
[te.y0, te.y1])
|
||||
|
||||
return fig
|
||||
|
||||
def joint(self, table):
|
||||
|
|
|
|||
Binary file not shown.
|
After Width: | Height: | Size: 68 KiB |
|
|
@ -41,8 +41,9 @@ You can specify the type of element you want to plot using the ``kind`` keyword
|
|||
- 'contour'
|
||||
- 'line'
|
||||
- 'joint'
|
||||
- 'textedge'
|
||||
|
||||
.. note:: The last three plot types can only be used with :ref:`Lattice <lattice>`, i.e. when ``flavor='lattice'``.
|
||||
.. note:: 'line' and 'joint' can only be used with :ref:`Lattice <lattice>`, and 'textedge' can only be used with :ref:`Stream <stream>`.
|
||||
|
||||
Let's generate a plot for each type using this `PDF <../_static/pdf/foo.pdf>`__ as an example. First, let's get all the tables out.
|
||||
|
||||
|
|
@ -143,6 +144,23 @@ Finally, let's plot all line intersections present on the table's PDF page.
|
|||
:alt: A plot of all line intersections on a PDF page
|
||||
:align: left
|
||||
|
||||
textedge
|
||||
^^^^^^^^
|
||||
|
||||
You can also visualize the textedges found on a page by specifying ``kind='textedge'``. To learn more about what a "textedge" is, see pages 20, 35 and 40 of `Anssi Nurminen's master's thesis <http://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3>`_.
|
||||
|
||||
::
|
||||
|
||||
>>> camelot.plot(tables[0], kind='textedge')
|
||||
>>> plt.show()
|
||||
|
||||
.. figure:: ../_static/png/plot_textedge.png
|
||||
:height: 674
|
||||
:width: 1366
|
||||
:scale: 50%
|
||||
:alt: A plot of relevant textedges on a PDF page
|
||||
:align: left
|
||||
|
||||
Specify table areas
|
||||
-------------------
|
||||
|
||||
|
|
|
|||
Binary file not shown.
|
After Width: | Height: | Size: 33 KiB |
Binary file not shown.
|
After Width: | Height: | Size: 13 KiB |
Binary file not shown.
|
After Width: | Height: | Size: 18 KiB |
|
|
@ -29,12 +29,20 @@ def test_grid_plot():
|
|||
|
||||
@pytest.mark.mpl_image_compare(
|
||||
baseline_dir="files/baseline_plots", remove_text=True)
|
||||
def test_contour_plot():
|
||||
def test_lattice_contour_plot():
|
||||
filename = os.path.join(testdir, "foo.pdf")
|
||||
tables = camelot.read_pdf(filename)
|
||||
return camelot.plot(tables[0], kind='contour')
|
||||
|
||||
|
||||
@pytest.mark.mpl_image_compare(
|
||||
baseline_dir="files/baseline_plots", remove_text=True)
|
||||
def test_stream_contour_plot():
|
||||
filename = os.path.join(testdir, "tabula/12s0324.pdf")
|
||||
tables = camelot.read_pdf(filename, flavor='stream')
|
||||
return camelot.plot(tables[0], kind='contour')
|
||||
|
||||
|
||||
@pytest.mark.mpl_image_compare(
|
||||
baseline_dir="files/baseline_plots", remove_text=True)
|
||||
def test_line_plot():
|
||||
|
|
@ -49,3 +57,11 @@ def test_joint_plot():
|
|||
filename = os.path.join(testdir, "foo.pdf")
|
||||
tables = camelot.read_pdf(filename)
|
||||
return camelot.plot(tables[0], kind='joint')
|
||||
|
||||
|
||||
@pytest.mark.mpl_image_compare(
|
||||
baseline_dir="files/baseline_plots", remove_text=True)
|
||||
def test_textedge_plot():
|
||||
filename = os.path.join(testdir, "tabula/12s0324.pdf")
|
||||
tables = camelot.read_pdf(filename, flavor='stream')
|
||||
return camelot.plot(tables[0], kind='textedge')
|
||||
|
|
|
|||
Loading…
Reference in New Issue