commit
50780e24f8
12
HISTORY.md
12
HISTORY.md
|
|
@ -4,6 +4,18 @@ Release History
|
||||||
master
|
master
|
||||||
------
|
------
|
||||||
|
|
||||||
|
**Improvements**
|
||||||
|
|
||||||
|
* [#207](https://github.com/socialcopsdev/camelot/issues/207) Add a plot type for Stream text edges and detected table areas. [#224](https://github.com/socialcopsdev/camelot/pull/224) by Vinayak Mehta.
|
||||||
|
|
||||||
|
**Bugfixes**
|
||||||
|
|
||||||
|
* [#217](https://github.com/socialcopsdev/camelot/issues/217) Fix IndexError when scale is large.
|
||||||
|
|
||||||
|
**Documentation**
|
||||||
|
|
||||||
|
* Add pdfplumber comparison and update Tabula (stream) comparison. Check out the [wiki page](https://github.com/socialcopsdev/camelot/wiki/Comparison-with-other-PDF-Table-Extraction-libraries-and-tools).
|
||||||
|
|
||||||
0.4.1 (2018-12-05)
|
0.4.1 (2018-12-05)
|
||||||
------------------
|
------------------
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -138,7 +138,7 @@ def lattice(c, *args, **kwargs):
|
||||||
@click.option('-c', '--col_close_tol', default=0, help='Tolerance parameter'
|
@click.option('-c', '--col_close_tol', default=0, help='Tolerance parameter'
|
||||||
' used to combine text horizontally, to generate columns.')
|
' used to combine text horizontally, to generate columns.')
|
||||||
@click.option('-plot', '--plot_type',
|
@click.option('-plot', '--plot_type',
|
||||||
type=click.Choice(['text', 'grid']),
|
type=click.Choice(['text', 'grid', 'contour', 'textedge']),
|
||||||
help='Plot elements found on PDF page for visual debugging.')
|
help='Plot elements found on PDF page for visual debugging.')
|
||||||
@click.argument('filepath', type=click.Path(exists=True))
|
@click.argument('filepath', type=click.Path(exists=True))
|
||||||
@pass_config
|
@pass_config
|
||||||
|
|
|
||||||
|
|
@ -341,6 +341,7 @@ class Lattice(BaseParser):
|
||||||
table._text = _text
|
table._text = _text
|
||||||
table._image = (self.image, self.table_bbox_unscaled)
|
table._image = (self.image, self.table_bbox_unscaled)
|
||||||
table._segments = (self.vertical_segments, self.horizontal_segments)
|
table._segments = (self.vertical_segments, self.horizontal_segments)
|
||||||
|
table._textedges = None
|
||||||
|
|
||||||
return table
|
return table
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -263,6 +263,7 @@ class Stream(BaseParser):
|
||||||
textedges.generate(textlines)
|
textedges.generate(textlines)
|
||||||
# select relevant edges
|
# select relevant edges
|
||||||
relevant_textedges = textedges.get_relevant()
|
relevant_textedges = textedges.get_relevant()
|
||||||
|
self.textedges.extend(relevant_textedges)
|
||||||
# guess table areas using textlines and relevant edges
|
# guess table areas using textlines and relevant edges
|
||||||
table_bbox = textedges.get_table_areas(textlines, relevant_textedges)
|
table_bbox = textedges.get_table_areas(textlines, relevant_textedges)
|
||||||
# treat whole page as table area if no table areas found
|
# treat whole page as table area if no table areas found
|
||||||
|
|
@ -272,6 +273,7 @@ class Stream(BaseParser):
|
||||||
return table_bbox
|
return table_bbox
|
||||||
|
|
||||||
def _generate_table_bbox(self):
|
def _generate_table_bbox(self):
|
||||||
|
self.textedges = []
|
||||||
if self.table_areas is not None:
|
if self.table_areas is not None:
|
||||||
table_bbox = {}
|
table_bbox = {}
|
||||||
for area in self.table_areas:
|
for area in self.table_areas:
|
||||||
|
|
@ -378,6 +380,7 @@ class Stream(BaseParser):
|
||||||
table._text = _text
|
table._text = _text
|
||||||
table._image = None
|
table._image = None
|
||||||
table._segments = None
|
table._segments = None
|
||||||
|
table._textedges = self.textedges
|
||||||
|
|
||||||
return table
|
return table
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -33,7 +33,10 @@ class PlotMethods(object):
|
||||||
if not _HAS_MPL:
|
if not _HAS_MPL:
|
||||||
raise ImportError('matplotlib is required for plotting.')
|
raise ImportError('matplotlib is required for plotting.')
|
||||||
|
|
||||||
if table.flavor == 'stream' and kind in ['contour', 'joint', 'line']:
|
if table.flavor == 'lattice' and kind in ['textedge']:
|
||||||
|
raise NotImplementedError("Lattice flavor does not support kind='{}'".format(
|
||||||
|
kind))
|
||||||
|
elif table.flavor == 'stream' and kind in ['joint', 'line']:
|
||||||
raise NotImplementedError("Stream flavor does not support kind='{}'".format(
|
raise NotImplementedError("Stream flavor does not support kind='{}'".format(
|
||||||
kind))
|
kind))
|
||||||
|
|
||||||
|
|
@ -114,22 +117,84 @@ class PlotMethods(object):
|
||||||
fig : matplotlib.fig.Figure
|
fig : matplotlib.fig.Figure
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
try:
|
||||||
img, table_bbox = table._image
|
img, table_bbox = table._image
|
||||||
|
_FOR_LATTICE = True
|
||||||
|
except TypeError:
|
||||||
|
img, table_bbox = (None, {table._bbox: None})
|
||||||
|
_FOR_LATTICE = False
|
||||||
fig = plt.figure()
|
fig = plt.figure()
|
||||||
ax = fig.add_subplot(111, aspect='equal')
|
ax = fig.add_subplot(111, aspect='equal')
|
||||||
|
|
||||||
|
xs, ys = [], []
|
||||||
|
if not _FOR_LATTICE:
|
||||||
|
for t in table._text:
|
||||||
|
xs.extend([t[0], t[2]])
|
||||||
|
ys.extend([t[1], t[3]])
|
||||||
|
ax.add_patch(
|
||||||
|
patches.Rectangle(
|
||||||
|
(t[0], t[1]),
|
||||||
|
t[2] - t[0],
|
||||||
|
t[3] - t[1],
|
||||||
|
color='blue'
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
for t in table_bbox.keys():
|
for t in table_bbox.keys():
|
||||||
ax.add_patch(
|
ax.add_patch(
|
||||||
patches.Rectangle(
|
patches.Rectangle(
|
||||||
(t[0], t[1]),
|
(t[0], t[1]),
|
||||||
t[2] - t[0],
|
t[2] - t[0],
|
||||||
t[3] - t[1],
|
t[3] - t[1],
|
||||||
fill=None,
|
fill=False,
|
||||||
edgecolor='red'
|
color='red'
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
if not _FOR_LATTICE:
|
||||||
|
xs.extend([t[0], t[2]])
|
||||||
|
ys.extend([t[1], t[3]])
|
||||||
|
ax.set_xlim(min(xs) - 10, max(xs) + 10)
|
||||||
|
ax.set_ylim(min(ys) - 10, max(ys) + 10)
|
||||||
|
|
||||||
|
if _FOR_LATTICE:
|
||||||
ax.imshow(img)
|
ax.imshow(img)
|
||||||
return fig
|
return fig
|
||||||
|
|
||||||
|
def textedge(self, table):
|
||||||
|
"""Generates a plot for relevant textedges.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
table : camelot.core.Table
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
fig : matplotlib.fig.Figure
|
||||||
|
|
||||||
|
"""
|
||||||
|
fig = plt.figure()
|
||||||
|
ax = fig.add_subplot(111, aspect='equal')
|
||||||
|
xs, ys = [], []
|
||||||
|
for t in table._text:
|
||||||
|
xs.extend([t[0], t[2]])
|
||||||
|
ys.extend([t[1], t[3]])
|
||||||
|
ax.add_patch(
|
||||||
|
patches.Rectangle(
|
||||||
|
(t[0], t[1]),
|
||||||
|
t[2] - t[0],
|
||||||
|
t[3] - t[1],
|
||||||
|
color='blue'
|
||||||
|
)
|
||||||
|
)
|
||||||
|
ax.set_xlim(min(xs) - 10, max(xs) + 10)
|
||||||
|
ax.set_ylim(min(ys) - 10, max(ys) + 10)
|
||||||
|
|
||||||
|
for te in table._textedges:
|
||||||
|
ax.plot([te.x, te.x],
|
||||||
|
[te.y0, te.y1])
|
||||||
|
|
||||||
|
return fig
|
||||||
|
|
||||||
def joint(self, table):
|
def joint(self, table):
|
||||||
"""Generates a plot for all line intersections present
|
"""Generates a plot for all line intersections present
|
||||||
on the PDF page.
|
on the PDF page.
|
||||||
|
|
|
||||||
Binary file not shown.
|
After Width: | Height: | Size: 68 KiB |
|
|
@ -41,8 +41,9 @@ You can specify the type of element you want to plot using the ``kind`` keyword
|
||||||
- 'contour'
|
- 'contour'
|
||||||
- 'line'
|
- 'line'
|
||||||
- 'joint'
|
- 'joint'
|
||||||
|
- 'textedge'
|
||||||
|
|
||||||
.. note:: The last three plot types can only be used with :ref:`Lattice <lattice>`, i.e. when ``flavor='lattice'``.
|
.. note:: 'line' and 'joint' can only be used with :ref:`Lattice <lattice>`, and 'textedge' can only be used with :ref:`Stream <stream>`.
|
||||||
|
|
||||||
Let's generate a plot for each type using this `PDF <../_static/pdf/foo.pdf>`__ as an example. First, let's get all the tables out.
|
Let's generate a plot for each type using this `PDF <../_static/pdf/foo.pdf>`__ as an example. First, let's get all the tables out.
|
||||||
|
|
||||||
|
|
@ -143,6 +144,23 @@ Finally, let's plot all line intersections present on the table's PDF page.
|
||||||
:alt: A plot of all line intersections on a PDF page
|
:alt: A plot of all line intersections on a PDF page
|
||||||
:align: left
|
:align: left
|
||||||
|
|
||||||
|
textedge
|
||||||
|
^^^^^^^^
|
||||||
|
|
||||||
|
You can also visualize the textedges found on a page by specifying ``kind='textedge'``. To learn more about what a "textedge" is, see pages 20, 35 and 40 of `Anssi Nurminen's master's thesis <http://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3>`_.
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
>>> camelot.plot(tables[0], kind='textedge')
|
||||||
|
>>> plt.show()
|
||||||
|
|
||||||
|
.. figure:: ../_static/png/plot_textedge.png
|
||||||
|
:height: 674
|
||||||
|
:width: 1366
|
||||||
|
:scale: 50%
|
||||||
|
:alt: A plot of relevant textedges on a PDF page
|
||||||
|
:align: left
|
||||||
|
|
||||||
Specify table areas
|
Specify table areas
|
||||||
-------------------
|
-------------------
|
||||||
|
|
||||||
|
|
|
||||||
Binary file not shown.
|
After Width: | Height: | Size: 33 KiB |
Binary file not shown.
|
After Width: | Height: | Size: 13 KiB |
Binary file not shown.
|
After Width: | Height: | Size: 18 KiB |
|
|
@ -29,12 +29,20 @@ def test_grid_plot():
|
||||||
|
|
||||||
@pytest.mark.mpl_image_compare(
|
@pytest.mark.mpl_image_compare(
|
||||||
baseline_dir="files/baseline_plots", remove_text=True)
|
baseline_dir="files/baseline_plots", remove_text=True)
|
||||||
def test_contour_plot():
|
def test_lattice_contour_plot():
|
||||||
filename = os.path.join(testdir, "foo.pdf")
|
filename = os.path.join(testdir, "foo.pdf")
|
||||||
tables = camelot.read_pdf(filename)
|
tables = camelot.read_pdf(filename)
|
||||||
return camelot.plot(tables[0], kind='contour')
|
return camelot.plot(tables[0], kind='contour')
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.mpl_image_compare(
|
||||||
|
baseline_dir="files/baseline_plots", remove_text=True)
|
||||||
|
def test_stream_contour_plot():
|
||||||
|
filename = os.path.join(testdir, "tabula/12s0324.pdf")
|
||||||
|
tables = camelot.read_pdf(filename, flavor='stream')
|
||||||
|
return camelot.plot(tables[0], kind='contour')
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.mpl_image_compare(
|
@pytest.mark.mpl_image_compare(
|
||||||
baseline_dir="files/baseline_plots", remove_text=True)
|
baseline_dir="files/baseline_plots", remove_text=True)
|
||||||
def test_line_plot():
|
def test_line_plot():
|
||||||
|
|
@ -49,3 +57,11 @@ def test_joint_plot():
|
||||||
filename = os.path.join(testdir, "foo.pdf")
|
filename = os.path.join(testdir, "foo.pdf")
|
||||||
tables = camelot.read_pdf(filename)
|
tables = camelot.read_pdf(filename)
|
||||||
return camelot.plot(tables[0], kind='joint')
|
return camelot.plot(tables[0], kind='joint')
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.mpl_image_compare(
|
||||||
|
baseline_dir="files/baseline_plots", remove_text=True)
|
||||||
|
def test_textedge_plot():
|
||||||
|
filename = os.path.join(testdir, "tabula/12s0324.pdf")
|
||||||
|
tables = camelot.read_pdf(filename, flavor='stream')
|
||||||
|
return camelot.plot(tables[0], kind='textedge')
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue