From 3f5af18738e269cffda41680eb95b87b89978f2b Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Thu, 20 Dec 2018 15:01:29 +0530 Subject: [PATCH] Add resolution --- camelot/cli.py | 2 ++ camelot/core.py | 11 ++++++----- camelot/parsers/lattice.py | 5 +++-- camelot/parsers/stream.py | 4 ++-- 4 files changed, 13 insertions(+), 9 deletions(-) diff --git a/camelot/cli.py b/camelot/cli.py index b0832f4..6c1b933 100644 --- a/camelot/cli.py +++ b/camelot/cli.py @@ -84,6 +84,8 @@ def cli(ctx, *args, **kwargs): ' may be zero or negative as well.') @click.option('-I', '--iterations', default=0, help='Number of times for erosion/dilation will be applied.') +@click.option('-res', '--resolution', default=300, + help='Resolution used for PDF to PNG conversion.') @click.option('-plot', '--plot_type', type=click.Choice(['text', 'grid', 'contour', 'joint', 'line']), help='Plot elements found on PDF page for visual debugging.') diff --git a/camelot/core.py b/camelot/core.py index 99268e1..dc3bd88 100644 --- a/camelot/core.py +++ b/camelot/core.py @@ -72,7 +72,8 @@ class TextEdges(object): the PDF page. The dict has three keys based on the alignments, and each key's value is a list of camelot.core.TextEdge objects. """ - def __init__(self): + def __init__(self, edge_close_tol=50): + self.edge_close_tol = edge_close_tol self._textedges = {'left': [], 'right': [], 'middle': []} @staticmethod @@ -104,7 +105,7 @@ class TextEdges(object): te = TextEdge(x, y0, y1, align=align) self._textedges[align].append(te) - def update(self, textline, edge_close_tol=50): + def update(self, textline): """Updates an existing text edge in the current dict. """ for align in ['left', 'right', 'middle']: @@ -114,15 +115,15 @@ class TextEdges(object): self.add(textline, align) else: self._textedges[align][idx].update_coords( - x_coord, textline.y0, edge_close_tol=edge_close_tol) + x_coord, textline.y0, edge_close_tol=self.edge_close_tol) - def generate(self, textlines, edge_close_tol=50): + def generate(self, textlines): """Generates the text edges dict based on horizontal text rows. """ for tl in textlines: if len(tl.get_text().strip()) > 1: # TODO: hacky - self.update(tl, edge_close_tol=edge_close_tol) + self.update(tl) def get_relevant(self): """Returns the list of relevant text edges (all share the same diff --git a/camelot/parsers/lattice.py b/camelot/parsers/lattice.py index 0ec53bd..cfbbcda 100644 --- a/camelot/parsers/lattice.py +++ b/camelot/parsers/lattice.py @@ -76,7 +76,7 @@ class Lattice(BaseParser): line_size_scaling=15, copy_text=None, shift_text=['l', 't'], split_text=False, flag_size=False, line_close_tol=2, joint_close_tol=2, threshold_blocksize=15, threshold_constant=-2, - iterations=0, **kwargs): + iterations=0, resolution=300, **kwargs): self.table_areas = table_areas self.process_background = process_background self.line_size_scaling = line_size_scaling @@ -89,6 +89,7 @@ class Lattice(BaseParser): self.threshold_blocksize = threshold_blocksize self.threshold_constant = threshold_constant self.iterations = iterations + self.resolution = resolution @staticmethod def _reduce_index(t, idx, shift_text): @@ -209,7 +210,7 @@ class Lattice(BaseParser): '-sDEVICE=png16m', '-o', self.imagename, - '-r600', + '-r{}'.format(self.resolution), self.filename ] gs = get_executable() diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py index 1f5a856..a7c5af4 100644 --- a/camelot/parsers/stream.py +++ b/camelot/parsers/stream.py @@ -255,9 +255,9 @@ class Stream(BaseParser): # TODO: add support for arabic text #141 # sort textlines in reading order textlines.sort(key=lambda x: (-x.y0, x.x0)) - textedges = TextEdges() + textedges = TextEdges(edge_close_tol=self.edge_close_tol) # generate left, middle and right textedges - textedges.generate(textlines, edge_close_tol=self.edge_close_tol) + textedges.generate(textlines) # select relevant edges relevant_textedges = textedges.get_relevant() self.textedges.extend(relevant_textedges)