From 87a2f4fdc9590e5df90a813df5bf27d4b7eb983c Mon Sep 17 00:00:00 2001
From: Vinayak Mehta
Date: Wed, 12 Dec 2018 07:36:07 +0530
Subject: [PATCH] Add textedge plot type

---
Notes:
    Line breaks and indentation in this patch were restored from a
    whitespace-collapsed copy; compare the hunks with the upstream commit
    before applying. The author e-mail address was not present in that copy.

    Summary of the hunks below:
    - cli.py: the plot_type choice list gains 'contour' and 'textedge'.
    - parsers/stream.py: relevant text edges are collected in
      self.textedges and attached to each table as table._textedges.
    - parsers/lattice.py: table._textedges is set to None.
    - plotting.py: kind='textedge' is rejected for the lattice flavor,
      'contour' is no longer rejected for the stream flavor, contour()
      falls back to drawing text boxes when table._image cannot be
      unpacked, and a new textedge() method plots the stored edges.

 camelot/cli.py             |  2 +-
 camelot/parsers/lattice.py |  1 +
 camelot/parsers/stream.py  |  3 ++
 camelot/plotting.py        | 75 +++++++++++++++++++++++++++++++++++---
 4 files changed, 75 insertions(+), 6 deletions(-)

diff --git a/camelot/cli.py b/camelot/cli.py
index eaae955..1b995aa 100644
--- a/camelot/cli.py
+++ b/camelot/cli.py
@@ -138,7 +138,7 @@ def lattice(c, *args, **kwargs):
 @click.option('-c', '--col_close_tol', default=0, help='Tolerance parameter'
               ' used to combine text horizontally, to generate columns.')
 @click.option('-plot', '--plot_type',
-              type=click.Choice(['text', 'grid']),
+              type=click.Choice(['text', 'grid', 'contour', 'textedge']),
               help='Plot elements found on PDF page for visual debugging.')
 @click.argument('filepath', type=click.Path(exists=True))
 @pass_config
diff --git a/camelot/parsers/lattice.py b/camelot/parsers/lattice.py
index 22c77e8..14d8f6c 100644
--- a/camelot/parsers/lattice.py
+++ b/camelot/parsers/lattice.py
@@ -341,6 +341,7 @@ class Lattice(BaseParser):
         table._text = _text
         table._image = (self.image, self.table_bbox_unscaled)
         table._segments = (self.vertical_segments, self.horizontal_segments)
+        table._textedges = None
 
         return table
 
diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py
index 3b9c068..b6785df 100644
--- a/camelot/parsers/stream.py
+++ b/camelot/parsers/stream.py
@@ -263,6 +263,7 @@ class Stream(BaseParser):
         textedges.generate(textlines)
         # select relevant edges
         relevant_textedges = textedges.get_relevant()
+        self.textedges.extend(relevant_textedges)
         # guess table areas using textlines and relevant edges
         table_bbox = textedges.get_table_areas(textlines, relevant_textedges)
         # treat whole page as table area if no table areas found
@@ -272,6 +273,7 @@ class Stream(BaseParser):
         return table_bbox
 
     def _generate_table_bbox(self):
+        self.textedges = []
         if self.table_areas is not None:
             table_bbox = {}
             for area in self.table_areas:
@@ -378,6 +380,7 @@ class Stream(BaseParser):
         table._text = _text
         table._image = None
         table._segments = None
+        table._textedges = self.textedges
 
         return table
 
diff --git a/camelot/plotting.py b/camelot/plotting.py
index 3b91cee..1320267 100644
--- a/camelot/plotting.py
+++ b/camelot/plotting.py
@@ -33,7 +33,10 @@ class PlotMethods(object):
         if not _HAS_MPL:
             raise ImportError('matplotlib is required for plotting.')
 
-        if table.flavor == 'stream' and kind in ['contour', 'joint', 'line']:
+        if table.flavor == 'lattice' and kind in ['textedge']:
+            raise NotImplementedError("Lattice flavor does not support kind='{}'".format(
+                kind))
+        elif table.flavor == 'stream' and kind in ['joint', 'line']:
             raise NotImplementedError("Stream flavor does not support kind='{}'".format(
                 kind))
 
@@ -114,20 +117,82 @@ class PlotMethods(object):
         fig : matplotlib.fig.Figure
 
         """
-        img, table_bbox = table._image
+        try:
+            img, table_bbox = table._image
+            _FOR_LATTICE = True
+        except TypeError:
+            img, table_bbox = (None, {table._bbox: None})
+            _FOR_LATTICE = False
         fig = plt.figure()
         ax = fig.add_subplot(111, aspect='equal')
+
+        xs, ys = [], []
+        if not _FOR_LATTICE:
+            for t in table._text:
+                xs.extend([t[0], t[2]])
+                ys.extend([t[1], t[3]])
+                ax.add_patch(
+                    patches.Rectangle(
+                        (t[0], t[1]),
+                        t[2] - t[0],
+                        t[3] - t[1],
+                        color='blue'
+                    )
+                )
+
         for t in table_bbox.keys():
             ax.add_patch(
                 patches.Rectangle(
                     (t[0], t[1]),
                     t[2] - t[0],
                     t[3] - t[1],
-                    fill=None,
-                    edgecolor='red'
+                    fill=False,
+                    color='red'
                 )
             )
-        ax.imshow(img)
+            if not _FOR_LATTICE:
+                xs.extend([t[0], t[2]])
+                ys.extend([t[1], t[3]])
+                ax.set_xlim(min(xs) - 10, max(xs) + 10)
+                ax.set_ylim(min(ys) - 10, max(ys) + 10)
+
+        if _FOR_LATTICE:
+            ax.imshow(img)
+        return fig
+
+    def textedge(self, table):
+        """Generates a plot for relevant textedges.
+
+        Parameters
+        ----------
+        table : camelot.core.Table
+
+        Returns
+        -------
+        fig : matplotlib.fig.Figure
+
+        """
+        fig = plt.figure()
+        ax = fig.add_subplot(111, aspect='equal')
+        xs, ys = [], []
+        for t in table._text:
+            xs.extend([t[0], t[2]])
+            ys.extend([t[1], t[3]])
+            ax.add_patch(
+                patches.Rectangle(
+                    (t[0], t[1]),
+                    t[2] - t[0],
+                    t[3] - t[1],
+                    color='blue'
+                )
+            )
+        ax.set_xlim(min(xs) - 10, max(xs) + 10)
+        ax.set_ylim(min(ys) - 10, max(ys) + 10)
+
+        for te in table._textedges:
+            ax.plot([te.x, te.x],
+                    [te.y0, te.y1])
+
         return fig
 
     def joint(self, table):