From 2115a0e177aa1116bad35ae720c9735a264fdcea Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Wed, 3 Jul 2019 22:04:19 +0530 Subject: [PATCH] Blacken code --- camelot/__init__.py | 10 +- camelot/__main__.py | 2 +- camelot/__version__.py | 22 +- camelot/cli.py | 341 ++++++++++++++++++---------- camelot/core.py | 201 +++++++++------- camelot/ext/ghostscript/__init__.py | 19 +- camelot/ext/ghostscript/_gsprint.py | 52 +++-- camelot/handlers.py | 77 ++++--- camelot/image_processing.py | 47 ++-- camelot/io.py | 26 ++- camelot/parsers/base.py | 10 +- camelot/parsers/lattice.py | 175 +++++++++----- camelot/parsers/stream.py | 117 ++++++---- camelot/plotting.py | 73 +++--- camelot/utils.py | 271 ++++++++++++++-------- 15 files changed, 892 insertions(+), 551 deletions(-) diff --git a/camelot/__init__.py b/camelot/__init__.py index 68815f2..6c1ca9e 100644 --- a/camelot/__init__.py +++ b/camelot/__init__.py @@ -9,8 +9,8 @@ from .io import read_pdf from .plotting import PlotMethods -def _write_usage(self, prog, args='', prefix='Usage: '): - return self._write_usage('camelot', args, prefix=prefix) +def _write_usage(self, prog, args="", prefix="Usage: "): + return self._write_usage("camelot", args, prefix=prefix) # monkey patch click.HelpFormatter @@ -18,10 +18,10 @@ HelpFormatter._write_usage = HelpFormatter.write_usage HelpFormatter.write_usage = _write_usage # set up logging -logger = logging.getLogger('camelot') +logger = logging.getLogger("camelot") -format_string = '%(asctime)s - %(levelname)s - %(message)s' -formatter = logging.Formatter(format_string, datefmt='%Y-%m-%dT%H:%M:%S') +format_string = "%(asctime)s - %(levelname)s - %(message)s" +formatter = logging.Formatter(format_string, datefmt="%Y-%m-%dT%H:%M:%S") handler = logging.StreamHandler() handler.setFormatter(formatter) diff --git a/camelot/__main__.py b/camelot/__main__.py index c945051..93040c6 100755 --- a/camelot/__main__.py +++ b/camelot/__main__.py @@ -3,7 +3,7 @@ from __future__ import absolute_import -__all__ = ('main',) +__all__ = ("main",) def main(): diff --git a/camelot/__version__.py b/camelot/__version__.py index b0fba8f..3713048 100644 --- a/camelot/__version__.py +++ b/camelot/__version__.py @@ -1,23 +1,23 @@ # -*- coding: utf-8 -*- VERSION = (0, 7, 2) -PRERELEASE = None # alpha, beta or rc +PRERELEASE = None # alpha, beta or rc REVISION = None def generate_version(version, prerelease=None, revision=None): - version_parts = ['.'.join(map(str, version))] + version_parts = [".".join(map(str, version))] if prerelease is not None: - version_parts.append('-{}'.format(prerelease)) + version_parts.append("-{}".format(prerelease)) if revision is not None: - version_parts.append('.{}'.format(revision)) - return ''.join(version_parts) + version_parts.append(".{}".format(revision)) + return "".join(version_parts) -__title__ = 'camelot-py' -__description__ = 'PDF Table Extraction for Humans.' -__url__ = 'http://camelot-py.readthedocs.io/' +__title__ = "camelot-py" +__description__ = "PDF Table Extraction for Humans." 
+__url__ = "http://camelot-py.readthedocs.io/" __version__ = generate_version(VERSION, prerelease=PRERELEASE, revision=REVISION) -__author__ = 'Vinayak Mehta' -__author_email__ = 'vmehta94@gmail.com' -__license__ = 'MIT License' +__author__ = "Vinayak Mehta" +__author_email__ = "vmehta94@gmail.com" +__license__ = "MIT License" diff --git a/camelot/cli.py b/camelot/cli.py index b661555..b46f657 100644 --- a/camelot/cli.py +++ b/camelot/cli.py @@ -3,6 +3,7 @@ import logging import click + try: import matplotlib.pyplot as plt except ImportError: @@ -13,7 +14,7 @@ else: from . import __version__, read_pdf, plot -logger = logging.getLogger('camelot') +logger = logging.getLogger("camelot") logger.setLevel(logging.INFO) @@ -30,23 +31,47 @@ pass_config = click.make_pass_decorator(Config) @click.group() @click.version_option(version=__version__) -@click.option('-q', '--quiet', is_flag=False, help='Suppress logs and warnings.') -@click.option('-p', '--pages', default='1', help='Comma-separated page numbers.' - ' Example: 1,3,4 or 1,4-end or all.') -@click.option('-pw', '--password', help='Password for decryption.') -@click.option('-o', '--output', help='Output file path.') -@click.option('-f', '--format', - type=click.Choice(['csv', 'json', 'excel', 'html', 'sqlite']), - help='Output file format.') -@click.option('-z', '--zip', is_flag=True, help='Create ZIP archive.') -@click.option('-split', '--split_text', is_flag=True, - help='Split text that spans across multiple cells.') -@click.option('-flag', '--flag_size', is_flag=True, help='Flag text based on' - ' font size. Useful to detect super/subscripts.') -@click.option('-strip', '--strip_text', help='Characters that should be stripped from a string before' - ' assigning it to a cell.') -@click.option('-M', '--margins', nargs=3, default=(1.0, 0.5, 0.1), - help='PDFMiner char_margin, line_margin and word_margin.') +@click.option("-q", "--quiet", is_flag=False, help="Suppress logs and warnings.") +@click.option( + "-p", + "--pages", + default="1", + help="Comma-separated page numbers." " Example: 1,3,4 or 1,4-end or all.", +) +@click.option("-pw", "--password", help="Password for decryption.") +@click.option("-o", "--output", help="Output file path.") +@click.option( + "-f", + "--format", + type=click.Choice(["csv", "json", "excel", "html", "sqlite"]), + help="Output file format.", +) +@click.option("-z", "--zip", is_flag=True, help="Create ZIP archive.") +@click.option( + "-split", + "--split_text", + is_flag=True, + help="Split text that spans across multiple cells.", +) +@click.option( + "-flag", + "--flag_size", + is_flag=True, + help="Flag text based on" " font size. Useful to detect super/subscripts.", +) +@click.option( + "-strip", + "--strip_text", + help="Characters that should be stripped from a string before" + " assigning it to a cell.", +) +@click.option( + "-M", + "--margins", + nargs=3, + default=(1.0, 0.5, 0.1), + help="PDFMiner char_margin, line_margin and word_margin.", +) @click.pass_context def cli(ctx, *args, **kwargs): """Camelot: PDF Table Extraction for Humans""" @@ -55,79 +80,131 @@ def cli(ctx, *args, **kwargs): ctx.obj.set_config(key, value) -@cli.command('lattice') -@click.option('-R', '--table_regions', default=[], multiple=True, - help='Page regions to analyze. Example: x1,y1,x2,y2' - ' where x1, y1 -> left-top and x2, y2 -> right-bottom.') -@click.option('-T', '--table_areas', default=[], multiple=True, - help='Table areas to process. 
Example: x1,y1,x2,y2' - ' where x1, y1 -> left-top and x2, y2 -> right-bottom.') -@click.option('-back', '--process_background', is_flag=True, - help='Process background lines.') -@click.option('-scale', '--line_scale', default=15, - help='Line size scaling factor. The larger the value,' - ' the smaller the detected lines.') -@click.option('-copy', '--copy_text', default=[], type=click.Choice(['h', 'v']), - multiple=True, help='Direction in which text in a spanning cell' - ' will be copied over.') -@click.option('-shift', '--shift_text', default=['l', 't'], - type=click.Choice(['', 'l', 'r', 't', 'b']), multiple=True, - help='Direction in which text in a spanning cell will flow.') -@click.option('-l', '--line_tol', default=2, - help='Tolerance parameter used to merge close vertical' - ' and horizontal lines.') -@click.option('-j', '--joint_tol', default=2, - help='Tolerance parameter used to decide whether' - ' the detected lines and points lie close to each other.') -@click.option('-block', '--threshold_blocksize', default=15, - help='For adaptive thresholding, size of a pixel' - ' neighborhood that is used to calculate a threshold value for' - ' the pixel. Example: 3, 5, 7, and so on.') -@click.option('-const', '--threshold_constant', default=-2, - help='For adaptive thresholding, constant subtracted' - ' from the mean or weighted mean. Normally, it is positive but' - ' may be zero or negative as well.') -@click.option('-I', '--iterations', default=0, - help='Number of times for erosion/dilation will be applied.') -@click.option('-res', '--resolution', default=300, - help='Resolution used for PDF to PNG conversion.') -@click.option('-plot', '--plot_type', - type=click.Choice(['text', 'grid', 'contour', 'joint', 'line']), - help='Plot elements found on PDF page for visual debugging.') -@click.argument('filepath', type=click.Path(exists=True)) +@cli.command("lattice") +@click.option( + "-R", + "--table_regions", + default=[], + multiple=True, + help="Page regions to analyze. Example: x1,y1,x2,y2" + " where x1, y1 -> left-top and x2, y2 -> right-bottom.", +) +@click.option( + "-T", + "--table_areas", + default=[], + multiple=True, + help="Table areas to process. Example: x1,y1,x2,y2" + " where x1, y1 -> left-top and x2, y2 -> right-bottom.", +) +@click.option( + "-back", "--process_background", is_flag=True, help="Process background lines." +) +@click.option( + "-scale", + "--line_scale", + default=15, + help="Line size scaling factor. The larger the value," + " the smaller the detected lines.", +) +@click.option( + "-copy", + "--copy_text", + default=[], + type=click.Choice(["h", "v"]), + multiple=True, + help="Direction in which text in a spanning cell" " will be copied over.", +) +@click.option( + "-shift", + "--shift_text", + default=["l", "t"], + type=click.Choice(["", "l", "r", "t", "b"]), + multiple=True, + help="Direction in which text in a spanning cell will flow.", +) +@click.option( + "-l", + "--line_tol", + default=2, + help="Tolerance parameter used to merge close vertical" " and horizontal lines.", +) +@click.option( + "-j", + "--joint_tol", + default=2, + help="Tolerance parameter used to decide whether" + " the detected lines and points lie close to each other.", +) +@click.option( + "-block", + "--threshold_blocksize", + default=15, + help="For adaptive thresholding, size of a pixel" + " neighborhood that is used to calculate a threshold value for" + " the pixel. 
Example: 3, 5, 7, and so on.", +) +@click.option( + "-const", + "--threshold_constant", + default=-2, + help="For adaptive thresholding, constant subtracted" + " from the mean or weighted mean. Normally, it is positive but" + " may be zero or negative as well.", +) +@click.option( + "-I", + "--iterations", + default=0, + help="Number of times for erosion/dilation will be applied.", +) +@click.option( + "-res", + "--resolution", + default=300, + help="Resolution used for PDF to PNG conversion.", +) +@click.option( + "-plot", + "--plot_type", + type=click.Choice(["text", "grid", "contour", "joint", "line"]), + help="Plot elements found on PDF page for visual debugging.", +) +@click.argument("filepath", type=click.Path(exists=True)) @pass_config def lattice(c, *args, **kwargs): """Use lines between text to parse the table.""" conf = c.config - pages = conf.pop('pages') - output = conf.pop('output') - f = conf.pop('format') - compress = conf.pop('zip') - quiet = conf.pop('quiet') - plot_type = kwargs.pop('plot_type') - filepath = kwargs.pop('filepath') + pages = conf.pop("pages") + output = conf.pop("output") + f = conf.pop("format") + compress = conf.pop("zip") + quiet = conf.pop("quiet") + plot_type = kwargs.pop("plot_type") + filepath = kwargs.pop("filepath") kwargs.update(conf) - table_regions = list(kwargs['table_regions']) - kwargs['table_regions'] = None if not table_regions else table_regions - table_areas = list(kwargs['table_areas']) - kwargs['table_areas'] = None if not table_areas else table_areas - copy_text = list(kwargs['copy_text']) - kwargs['copy_text'] = None if not copy_text else copy_text - kwargs['shift_text'] = list(kwargs['shift_text']) + table_regions = list(kwargs["table_regions"]) + kwargs["table_regions"] = None if not table_regions else table_regions + table_areas = list(kwargs["table_areas"]) + kwargs["table_areas"] = None if not table_areas else table_areas + copy_text = list(kwargs["copy_text"]) + kwargs["copy_text"] = None if not copy_text else copy_text + kwargs["shift_text"] = list(kwargs["shift_text"]) if plot_type is not None: if not _HAS_MPL: - raise ImportError('matplotlib is required for plotting.') + raise ImportError("matplotlib is required for plotting.") else: if output is None: - raise click.UsageError('Please specify output file path using --output') + raise click.UsageError("Please specify output file path using --output") if f is None: - raise click.UsageError('Please specify output file format using --format') + raise click.UsageError("Please specify output file format using --format") - tables = read_pdf(filepath, pages=pages, flavor='lattice', - suppress_stdout=quiet, **kwargs) - click.echo('Found {} tables'.format(tables.n)) + tables = read_pdf( + filepath, pages=pages, flavor="lattice", suppress_stdout=quiet, **kwargs + ) + click.echo("Found {} tables".format(tables.n)) if plot_type is not None: for table in tables: plot(table, kind=plot_type) @@ -136,57 +213,89 @@ def lattice(c, *args, **kwargs): tables.export(output, f=f, compress=compress) -@cli.command('stream') -@click.option('-R', '--table_regions', default=[], multiple=True, - help='Page regions to analyze. Example: x1,y1,x2,y2' - ' where x1, y1 -> left-top and x2, y2 -> right-bottom.') -@click.option('-T', '--table_areas', default=[], multiple=True, - help='Table areas to process. 
Example: x1,y1,x2,y2' - ' where x1, y1 -> left-top and x2, y2 -> right-bottom.') -@click.option('-C', '--columns', default=[], multiple=True, - help='X coordinates of column separators.') -@click.option('-e', '--edge_tol', default=50, help='Tolerance parameter' - ' for extending textedges vertically.') -@click.option('-r', '--row_tol', default=2, help='Tolerance parameter' - ' used to combine text vertically, to generate rows.') -@click.option('-c', '--column_tol', default=0, help='Tolerance parameter' - ' used to combine text horizontally, to generate columns.') -@click.option('-plot', '--plot_type', - type=click.Choice(['text', 'grid', 'contour', 'textedge']), - help='Plot elements found on PDF page for visual debugging.') -@click.argument('filepath', type=click.Path(exists=True)) +@cli.command("stream") +@click.option( + "-R", + "--table_regions", + default=[], + multiple=True, + help="Page regions to analyze. Example: x1,y1,x2,y2" + " where x1, y1 -> left-top and x2, y2 -> right-bottom.", +) +@click.option( + "-T", + "--table_areas", + default=[], + multiple=True, + help="Table areas to process. Example: x1,y1,x2,y2" + " where x1, y1 -> left-top and x2, y2 -> right-bottom.", +) +@click.option( + "-C", + "--columns", + default=[], + multiple=True, + help="X coordinates of column separators.", +) +@click.option( + "-e", + "--edge_tol", + default=50, + help="Tolerance parameter" " for extending textedges vertically.", +) +@click.option( + "-r", + "--row_tol", + default=2, + help="Tolerance parameter" " used to combine text vertically, to generate rows.", +) +@click.option( + "-c", + "--column_tol", + default=0, + help="Tolerance parameter" + " used to combine text horizontally, to generate columns.", +) +@click.option( + "-plot", + "--plot_type", + type=click.Choice(["text", "grid", "contour", "textedge"]), + help="Plot elements found on PDF page for visual debugging.", +) +@click.argument("filepath", type=click.Path(exists=True)) @pass_config def stream(c, *args, **kwargs): """Use spaces between text to parse the table.""" conf = c.config - pages = conf.pop('pages') - output = conf.pop('output') - f = conf.pop('format') - compress = conf.pop('zip') - quiet = conf.pop('quiet') - plot_type = kwargs.pop('plot_type') - filepath = kwargs.pop('filepath') + pages = conf.pop("pages") + output = conf.pop("output") + f = conf.pop("format") + compress = conf.pop("zip") + quiet = conf.pop("quiet") + plot_type = kwargs.pop("plot_type") + filepath = kwargs.pop("filepath") kwargs.update(conf) - table_regions = list(kwargs['table_regions']) - kwargs['table_regions'] = None if not table_regions else table_regions - table_areas = list(kwargs['table_areas']) - kwargs['table_areas'] = None if not table_areas else table_areas - columns = list(kwargs['columns']) - kwargs['columns'] = None if not columns else columns + table_regions = list(kwargs["table_regions"]) + kwargs["table_regions"] = None if not table_regions else table_regions + table_areas = list(kwargs["table_areas"]) + kwargs["table_areas"] = None if not table_areas else table_areas + columns = list(kwargs["columns"]) + kwargs["columns"] = None if not columns else columns if plot_type is not None: if not _HAS_MPL: - raise ImportError('matplotlib is required for plotting.') + raise ImportError("matplotlib is required for plotting.") else: if output is None: - raise click.UsageError('Please specify output file path using --output') + raise click.UsageError("Please specify output file path using --output") if f is None: - raise 
click.UsageError('Please specify output file format using --format') + raise click.UsageError("Please specify output file format using --format") - tables = read_pdf(filepath, pages=pages, flavor='stream', - suppress_stdout=quiet, **kwargs) - click.echo('Found {} tables'.format(tables.n)) + tables = read_pdf( + filepath, pages=pages, flavor="stream", suppress_stdout=quiet, **kwargs + ) + click.echo("Found {} tables".format(tables.n)) if plot_type is not None: for table in tables: plot(table, kind=plot_type) diff --git a/camelot/core.py b/camelot/core.py index 63425cc..b7a02b1 100644 --- a/camelot/core.py +++ b/camelot/core.py @@ -42,7 +42,8 @@ class TextEdge(object): TEXTEDGE_REQUIRED_ELEMENTS horizontal text rows. """ - def __init__(self, x, y0, y1, align='left'): + + def __init__(self, x, y0, y1, align="left"): self.x = x self.y0 = y0 self.y1 = y1 @@ -51,8 +52,13 @@ class TextEdge(object): self.is_valid = False def __repr__(self): - return ''.format( - round(self.x, 2), round(self.y0, 2), round(self.y1, 2), self.align, self.is_valid) + return "".format( + round(self.x, 2), + round(self.y0, 2), + round(self.y1, 2), + self.align, + self.is_valid, + ) def update_coords(self, x, y0, edge_tol=50): """Updates the text edge's x and bottom y coordinates and sets @@ -73,9 +79,10 @@ class TextEdges(object): the PDF page. The dict has three keys based on the alignments, and each key's value is a list of camelot.core.TextEdge objects. """ + def __init__(self, edge_tol=50): self.edge_tol = edge_tol - self._textedges = {'left': [], 'right': [], 'middle': []} + self._textedges = {"left": [], "right": [], "middle": []} @staticmethod def get_x_coord(textline, align): @@ -85,7 +92,7 @@ class TextEdges(object): x_left = textline.x0 x_right = textline.x1 x_middle = x_left + (x_right - x_left) / 2.0 - x_coord = {'left': x_left, 'middle': x_middle, 'right': x_right} + x_coord = {"left": x_left, "middle": x_middle, "right": x_right} return x_coord[align] def find(self, x_coord, align): @@ -109,21 +116,22 @@ class TextEdges(object): def update(self, textline): """Updates an existing text edge in the current dict. """ - for align in ['left', 'right', 'middle']: + for align in ["left", "right", "middle"]: x_coord = self.get_x_coord(textline, align) idx = self.find(x_coord, align) if idx is None: self.add(textline, align) else: self._textedges[align][idx].update_coords( - x_coord, textline.y0, edge_tol=self.edge_tol) + x_coord, textline.y0, edge_tol=self.edge_tol + ) def generate(self, textlines): """Generates the text edges dict based on horizontal text rows. """ for tl in textlines: - if len(tl.get_text().strip()) > 1: # TODO: hacky + if len(tl.get_text().strip()) > 1: # TODO: hacky self.update(tl) def get_relevant(self): @@ -132,9 +140,15 @@ class TextEdges(object): the most. """ intersections_sum = { - 'left': sum(te.intersections for te in self._textedges['left'] if te.is_valid), - 'right': sum(te.intersections for te in self._textedges['right'] if te.is_valid), - 'middle': sum(te.intersections for te in self._textedges['middle'] if te.is_valid) + "left": sum( + te.intersections for te in self._textedges["left"] if te.is_valid + ), + "right": sum( + te.intersections for te in self._textedges["right"] if te.is_valid + ), + "middle": sum( + te.intersections for te in self._textedges["middle"] if te.is_valid + ), } # TODO: naive @@ -147,6 +161,7 @@ class TextEdges(object): """Returns a dict of interesting table areas on the PDF page calculated using relevant text edges. 
""" + def pad(area, average_row_height): x0 = area[0] - TABLE_AREA_PADDING y0 = area[1] - TABLE_AREA_PADDING @@ -175,7 +190,11 @@ class TextEdges(object): else: table_areas.pop(found) updated_area = ( - found[0], min(te.y0, found[1]), max(found[2], te.x), max(found[3], te.y1)) + found[0], + min(te.y0, found[1]), + max(found[2], te.x), + max(found[3], te.y1), + ) table_areas[updated_area] = None # extend table areas based on textlines that overlap @@ -196,7 +215,11 @@ class TextEdges(object): if found is not None: table_areas.pop(found) updated_area = ( - min(tl.x0, found[0]), min(tl.y0, found[1]), max(found[2], tl.x1), max(found[3], tl.y1)) + min(tl.x0, found[0]), + min(tl.y0, found[1]), + max(found[2], tl.x1), + max(found[3], tl.y1), + ) table_areas[updated_area] = None average_textline_height = sum_textline_height / float(len(textlines)) @@ -265,11 +288,12 @@ class Cell(object): self.bottom = False self.hspan = False self.vspan = False - self._text = '' + self._text = "" def __repr__(self): - return ''.format( - round(self.x1, 2), round(self.y1, 2), round(self.x2, 2), round(self.y2, 2)) + return "".format( + round(self.x1, 2), round(self.y1, 2), round(self.x2, 2), round(self.y2, 2) + ) @property def text(self): @@ -277,7 +301,7 @@ class Cell(object): @text.setter def text(self, t): - self._text = ''.join([self._text, t]) + self._text = "".join([self._text, t]) @property def bound(self): @@ -314,11 +338,11 @@ class Table(object): PDF page number. """ + def __init__(self, cols, rows): self.cols = cols self.rows = rows - self.cells = [[Cell(c[0], r[1], c[1], r[0]) - for c in cols] for r in rows] + self.cells = [[Cell(c[0], r[1], c[1], r[0]) for c in cols] for r in rows] self.df = None self.shape = (0, 0) self.accuracy = 0 @@ -327,7 +351,7 @@ class Table(object): self.page = None def __repr__(self): - return '<{} shape={}>'.format(self.__class__.__name__, self.shape) + return "<{} shape={}>".format(self.__class__.__name__, self.shape) def __lt__(self, other): if self.page == other.page: @@ -352,10 +376,10 @@ class Table(object): """ # pretty? 
report = { - 'accuracy': round(self.accuracy, 2), - 'whitespace': round(self.whitespace, 2), - 'order': self.order, - 'page': self.page + "accuracy": round(self.accuracy, 2), + "whitespace": round(self.whitespace, 2), + "order": self.order, + "page": self.page, } return report @@ -383,12 +407,21 @@ class Table(object): for v in vertical: # find closest x coord # iterate over y coords and find closest start and end points - i = [i for i, t in enumerate(self.cols) - if np.isclose(v[0], t[0], atol=joint_tol)] - j = [j for j, t in enumerate(self.rows) - if np.isclose(v[3], t[0], atol=joint_tol)] - k = [k for k, t in enumerate(self.rows) - if np.isclose(v[1], t[0], atol=joint_tol)] + i = [ + i + for i, t in enumerate(self.cols) + if np.isclose(v[0], t[0], atol=joint_tol) + ] + j = [ + j + for j, t in enumerate(self.rows) + if np.isclose(v[3], t[0], atol=joint_tol) + ] + k = [ + k + for k, t in enumerate(self.rows) + if np.isclose(v[1], t[0], atol=joint_tol) + ] if not j: continue J = j[0] @@ -434,12 +467,21 @@ class Table(object): for h in horizontal: # find closest y coord # iterate over x coords and find closest start and end points - i = [i for i, t in enumerate(self.rows) - if np.isclose(h[1], t[0], atol=joint_tol)] - j = [j for j, t in enumerate(self.cols) - if np.isclose(h[0], t[0], atol=joint_tol)] - k = [k for k, t in enumerate(self.cols) - if np.isclose(h[2], t[0], atol=joint_tol)] + i = [ + i + for i, t in enumerate(self.rows) + if np.isclose(h[1], t[0], atol=joint_tol) + ] + j = [ + j + for j, t in enumerate(self.cols) + if np.isclose(h[0], t[0], atol=joint_tol) + ] + k = [ + k + for k, t in enumerate(self.cols) + if np.isclose(h[2], t[0], atol=joint_tol) + ] if not j: continue J = j[0] @@ -537,12 +579,7 @@ class Table(object): Output filepath. """ - kw = { - 'encoding': 'utf-8', - 'index': False, - 'header': False, - 'quoting': 1 - } + kw = {"encoding": "utf-8", "index": False, "header": False, "quoting": 1} kw.update(kwargs) self.df.to_csv(path, **kw) @@ -557,12 +594,10 @@ class Table(object): Output filepath. """ - kw = { - 'orient': 'records' - } + kw = {"orient": "records"} kw.update(kwargs) json_string = self.df.to_json(**kw) - with open(path, 'w') as f: + with open(path, "w") as f: f.write(json_string) def to_excel(self, path, **kwargs): @@ -577,8 +612,8 @@ class Table(object): """ kw = { - 'sheet_name': 'page-{}-table-{}'.format(self.page, self.order), - 'encoding': 'utf-8' + "sheet_name": "page-{}-table-{}".format(self.page, self.order), + "encoding": "utf-8", } kw.update(kwargs) writer = pd.ExcelWriter(path) @@ -597,7 +632,7 @@ class Table(object): """ html_string = self.df.to_html(**kwargs) - with open(path, 'w') as f: + with open(path, "w") as f: f.write(html_string) def to_sqlite(self, path, **kwargs): @@ -611,13 +646,10 @@ class Table(object): Output filepath. """ - kw = { - 'if_exists': 'replace', - 'index': False - } + kw = {"if_exists": "replace", "index": False} kw.update(kwargs) conn = sqlite3.connect(path) - table_name = 'page-{}-table-{}'.format(self.page, self.order) + table_name = "page-{}-table-{}".format(self.page, self.order) self.df.to_sql(table_name, conn, **kw) conn.commit() conn.close() @@ -633,12 +665,12 @@ class TableList(object): Number of tables in the list. 
""" + def __init__(self, tables): self._tables = tables def __repr__(self): - return '<{} n={}>'.format( - self.__class__.__name__, self.n) + return "<{} n={}>".format(self.__class__.__name__, self.n) def __len__(self): return len(self._tables) @@ -648,37 +680,39 @@ class TableList(object): @staticmethod def _format_func(table, f): - return getattr(table, 'to_{}'.format(f)) + return getattr(table, "to_{}".format(f)) @property def n(self): return len(self) def _write_file(self, f=None, **kwargs): - dirname = kwargs.get('dirname') - root = kwargs.get('root') - ext = kwargs.get('ext') + dirname = kwargs.get("dirname") + root = kwargs.get("root") + ext = kwargs.get("ext") for table in self._tables: - filename = os.path.join('{}-page-{}-table-{}{}'.format( - root, table.page, table.order, ext)) + filename = os.path.join( + "{}-page-{}-table-{}{}".format(root, table.page, table.order, ext) + ) filepath = os.path.join(dirname, filename) to_format = self._format_func(table, f) to_format(filepath) def _compress_dir(self, **kwargs): - path = kwargs.get('path') - dirname = kwargs.get('dirname') - root = kwargs.get('root') - ext = kwargs.get('ext') - zipname = os.path.join(os.path.dirname(path), root) + '.zip' - with zipfile.ZipFile(zipname, 'w', allowZip64=True) as z: + path = kwargs.get("path") + dirname = kwargs.get("dirname") + root = kwargs.get("root") + ext = kwargs.get("ext") + zipname = os.path.join(os.path.dirname(path), root) + ".zip" + with zipfile.ZipFile(zipname, "w", allowZip64=True) as z: for table in self._tables: - filename = os.path.join('{}-page-{}-table-{}{}'.format( - root, table.page, table.order, ext)) + filename = os.path.join( + "{}-page-{}-table-{}{}".format(root, table.page, table.order, ext) + ) filepath = os.path.join(dirname, filename) z.write(filepath, os.path.basename(filepath)) - def export(self, path, f='csv', compress=False): + def export(self, path, f="csv", compress=False): """Exports the list of tables to specified file format. 
Parameters @@ -697,33 +731,28 @@ class TableList(object): if compress: dirname = tempfile.mkdtemp() - kwargs = { - 'path': path, - 'dirname': dirname, - 'root': root, - 'ext': ext - } + kwargs = {"path": path, "dirname": dirname, "root": root, "ext": ext} - if f in ['csv', 'json', 'html']: + if f in ["csv", "json", "html"]: self._write_file(f=f, **kwargs) if compress: self._compress_dir(**kwargs) - elif f == 'excel': + elif f == "excel": filepath = os.path.join(dirname, basename) writer = pd.ExcelWriter(filepath) for table in self._tables: - sheet_name = 'page-{}-table-{}'.format(table.page, table.order) - table.df.to_excel(writer, sheet_name=sheet_name, encoding='utf-8') + sheet_name = "page-{}-table-{}".format(table.page, table.order) + table.df.to_excel(writer, sheet_name=sheet_name, encoding="utf-8") writer.save() if compress: - zipname = os.path.join(os.path.dirname(path), root) + '.zip' - with zipfile.ZipFile(zipname, 'w', allowZip64=True) as z: + zipname = os.path.join(os.path.dirname(path), root) + ".zip" + with zipfile.ZipFile(zipname, "w", allowZip64=True) as z: z.write(filepath, os.path.basename(filepath)) - elif f == 'sqlite': + elif f == "sqlite": filepath = os.path.join(dirname, basename) for table in self._tables: table.to_sqlite(filepath) if compress: - zipname = os.path.join(os.path.dirname(path), root) + '.zip' - with zipfile.ZipFile(zipname, 'w', allowZip64=True) as z: + zipname = os.path.join(os.path.dirname(path), root) + ".zip" + with zipfile.ZipFile(zipname, "w", allowZip64=True) as z: z.write(filepath, os.path.basename(filepath)) diff --git a/camelot/ext/ghostscript/__init__.py b/camelot/ext/ghostscript/__init__.py index 2751ef7..1b4ec48 100644 --- a/camelot/ext/ghostscript/__init__.py +++ b/camelot/ext/ghostscript/__init__.py @@ -24,10 +24,10 @@ ghostscript - A Python interface for the Ghostscript interpreter C-API from . 
import _gsprint as gs -__author__ = 'Hartmut Goebel ' -__copyright__ = 'Copyright 2010-2018 by Hartmut Goebel ' -__license__ = 'GNU General Public License version 3 (GPL v3)' -__version__ = '0.6' +__author__ = "Hartmut Goebel " +__copyright__ = "Copyright 2010-2018 by Hartmut Goebel " +__license__ = "GNU General Public License version 3 (GPL v3)" +__version__ = "0.6" class __Ghostscript(object): @@ -87,10 +87,13 @@ def Ghostscript(*args, **kwargs): # Ghostscript only supports a single instance if __instance__ is None: __instance__ = gs.new_instance() - return __Ghostscript(__instance__, args, - stdin=kwargs.get('stdin', None), - stdout=kwargs.get('stdout', None), - stderr=kwargs.get('stderr', None)) + return __Ghostscript( + __instance__, + args, + stdin=kwargs.get("stdin", None), + stdout=kwargs.get("stdout", None), + stderr=kwargs.get("stderr", None), + ) __instance__ = None diff --git a/camelot/ext/ghostscript/_gsprint.py b/camelot/ext/ghostscript/_gsprint.py index b30165c..efc6be7 100644 --- a/camelot/ext/ghostscript/_gsprint.py +++ b/camelot/ext/ghostscript/_gsprint.py @@ -42,10 +42,10 @@ e_Info = -110 # e_Quit = -101 -__author__ = 'Hartmut Goebel ' -__copyright__ = 'Copyright 2010-2018 by Hartmut Goebel ' -__license__ = 'GNU General Public License version 3 (GPL v3)' -__version__ = '0.6' +__author__ = "Hartmut Goebel " +__copyright__ = "Copyright 2010-2018 by Hartmut Goebel " +__license__ = "GNU General Public License version 3 (GPL v3)" +__version__ = "0.6" gs_main_instance = c_void_p display_callback = c_void_p @@ -55,7 +55,7 @@ display_callback = c_void_p class GhostscriptError(Exception): def __init__(self, ecode): - self.code = ecode + self.code = ecode def new_instance(): @@ -89,6 +89,7 @@ def _wrap_stdin(infp): """Wrap a filehandle into a C function to be used as `stdin` callback for ``set_stdio``. The filehandle has to support the readline() method. """ + def _wrap(instance, dest, count): try: data = infp.readline(count) @@ -110,6 +111,7 @@ def _wrap_stdout(outfp): `stderr` callback for ``set_stdio``. The filehandle has to support the write() and flush() methods. """ + def _wrap(instance, str, count): outfp.write(str[:count]) outfp.flush() @@ -187,11 +189,23 @@ def __win32_finddll(): import winreg except ImportError: # assume Python 2 - from _winreg import OpenKey, CloseKey, EnumKey, QueryValueEx, \ - QueryInfoKey, HKEY_LOCAL_MACHINE + from _winreg import ( + OpenKey, + CloseKey, + EnumKey, + QueryValueEx, + QueryInfoKey, + HKEY_LOCAL_MACHINE, + ) else: - from winreg import OpenKey, CloseKey, EnumKey, QueryValueEx, \ - QueryInfoKey, HKEY_LOCAL_MACHINE + from winreg import ( + OpenKey, + CloseKey, + EnumKey, + QueryValueEx, + QueryInfoKey, + HKEY_LOCAL_MACHINE, + ) from distutils.version import LooseVersion import os @@ -199,15 +213,19 @@ def __win32_finddll(): dlls = [] # Look up different variants of Ghostscript and take the highest # version for which the DLL is to be found in the filesystem. 
- for key_name in ('AFPL Ghostscript', 'Aladdin Ghostscript', - 'GNU Ghostscript', 'GPL Ghostscript'): + for key_name in ( + "AFPL Ghostscript", + "Aladdin Ghostscript", + "GNU Ghostscript", + "GPL Ghostscript", + ): try: k1 = OpenKey(HKEY_LOCAL_MACHINE, "Software\\%s" % key_name) for num in range(0, QueryInfoKey(k1)[0]): version = EnumKey(k1, num) try: k2 = OpenKey(k1, version) - dll_path = QueryValueEx(k2, 'GS_DLL')[0] + dll_path = QueryValueEx(k2, "GS_DLL")[0] CloseKey(k2) if os.path.exists(dll_path): dlls.append((LooseVersion(version), dll_path)) @@ -223,21 +241,21 @@ def __win32_finddll(): return None -if sys.platform == 'win32': +if sys.platform == "win32": libgs = __win32_finddll() if not libgs: - raise RuntimeError('Please make sure that Ghostscript is installed') + raise RuntimeError("Please make sure that Ghostscript is installed") libgs = windll.LoadLibrary(libgs) else: try: - libgs = cdll.LoadLibrary('libgs.so') + libgs = cdll.LoadLibrary("libgs.so") except OSError: # shared object file not found import ctypes.util - libgs = ctypes.util.find_library('gs') + libgs = ctypes.util.find_library("gs") if not libgs: - raise RuntimeError('Please make sure that Ghostscript is installed') + raise RuntimeError("Please make sure that Ghostscript is installed") libgs = cdll.LoadLibrary(libgs) del __win32_finddll diff --git a/camelot/handlers.py b/camelot/handlers.py index d773e4a..3a6d663 100644 --- a/camelot/handlers.py +++ b/camelot/handlers.py @@ -7,8 +7,14 @@ from PyPDF2 import PdfFileReader, PdfFileWriter from .core import TableList from .parsers import Stream, Lattice -from .utils import (TemporaryDirectory, get_page_layout, get_text_objects, - get_rotation, is_url, download_url) +from .utils import ( + TemporaryDirectory, + get_page_layout, + get_text_objects, + get_rotation, + is_url, + download_url, +) class PDFHandler(object): @@ -27,19 +33,20 @@ class PDFHandler(object): Password for decryption. 
""" - def __init__(self, filepath, pages='1', password=None): + + def __init__(self, filepath, pages="1", password=None): if is_url(filepath): filepath = download_url(filepath) self.filepath = filepath - if not filepath.lower().endswith('.pdf'): + if not filepath.lower().endswith(".pdf"): raise NotImplementedError("File format not supported") if password is None: - self.password = '' + self.password = "" else: self.password = password if sys.version_info[0] < 3: - self.password = self.password.encode('ascii') + self.password = self.password.encode("ascii") self.pages = self._get_pages(self.filepath, pages) def _get_pages(self, filepath, pages): @@ -60,26 +67,26 @@ class PDFHandler(object): """ page_numbers = [] - if pages == '1': - page_numbers.append({'start': 1, 'end': 1}) + if pages == "1": + page_numbers.append({"start": 1, "end": 1}) else: - infile = PdfFileReader(open(filepath, 'rb'), strict=False) + infile = PdfFileReader(open(filepath, "rb"), strict=False) if infile.isEncrypted: infile.decrypt(self.password) - if pages == 'all': - page_numbers.append({'start': 1, 'end': infile.getNumPages()}) + if pages == "all": + page_numbers.append({"start": 1, "end": infile.getNumPages()}) else: - for r in pages.split(','): - if '-' in r: - a, b = r.split('-') - if b == 'end': + for r in pages.split(","): + if "-" in r: + a, b = r.split("-") + if b == "end": b = infile.getNumPages() - page_numbers.append({'start': int(a), 'end': int(b)}) + page_numbers.append({"start": int(a), "end": int(b)}) else: - page_numbers.append({'start': int(r), 'end': int(r)}) + page_numbers.append({"start": int(r), "end": int(r)}) P = [] for p in page_numbers: - P.extend(range(p['start'], p['end'] + 1)) + P.extend(range(p["start"], p["end"] + 1)) return sorted(set(P)) def _save_page(self, filepath, page, temp): @@ -95,16 +102,16 @@ class PDFHandler(object): Tmp directory. 
""" - with open(filepath, 'rb') as fileobj: + with open(filepath, "rb") as fileobj: infile = PdfFileReader(fileobj, strict=False) if infile.isEncrypted: infile.decrypt(self.password) - fpath = os.path.join(temp, 'page-{0}.pdf'.format(page)) + fpath = os.path.join(temp, "page-{0}.pdf".format(page)) froot, fext = os.path.splitext(fpath) p = infile.getPage(page - 1) outfile = PdfFileWriter() outfile.addPage(p) - with open(fpath, 'wb') as f: + with open(fpath, "wb") as f: outfile.write(f) layout, dim = get_page_layout(fpath) # fix rotated PDF @@ -112,23 +119,25 @@ class PDFHandler(object): horizontal_text = get_text_objects(layout, ltype="horizontal_text") vertical_text = get_text_objects(layout, ltype="vertical_text") rotation = get_rotation(chars, horizontal_text, vertical_text) - if rotation != '': - fpath_new = ''.join([froot.replace('page', 'p'), '_rotated', fext]) + if rotation != "": + fpath_new = "".join([froot.replace("page", "p"), "_rotated", fext]) os.rename(fpath, fpath_new) - infile = PdfFileReader(open(fpath_new, 'rb'), strict=False) + infile = PdfFileReader(open(fpath_new, "rb"), strict=False) if infile.isEncrypted: infile.decrypt(self.password) outfile = PdfFileWriter() p = infile.getPage(0) - if rotation == 'anticlockwise': + if rotation == "anticlockwise": p.rotateClockwise(90) - elif rotation == 'clockwise': + elif rotation == "clockwise": p.rotateCounterClockwise(90) outfile.addPage(p) - with open(fpath, 'wb') as f: + with open(fpath, "wb") as f: outfile.write(f) - def parse(self, flavor='lattice', suppress_stdout=False, layout_kwargs={}, **kwargs): + def parse( + self, flavor="lattice", suppress_stdout=False, layout_kwargs={}, **kwargs + ): """Extracts tables by calling parser.get_tables on all single page PDFs. @@ -154,11 +163,13 @@ class PDFHandler(object): with TemporaryDirectory() as tempdir: for p in self.pages: self._save_page(self.filepath, p, tempdir) - pages = [os.path.join(tempdir, 'page-{0}.pdf'.format(p)) - for p in self.pages] - parser = Lattice(**kwargs) if flavor == 'lattice' else Stream(**kwargs) + pages = [ + os.path.join(tempdir, "page-{0}.pdf".format(p)) for p in self.pages + ] + parser = Lattice(**kwargs) if flavor == "lattice" else Stream(**kwargs) for p in pages: - t = parser.extract_tables(p, suppress_stdout=suppress_stdout, - layout_kwargs=layout_kwargs) + t = parser.extract_tables( + p, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs + ) tables.extend(t) return TableList(sorted(tables)) diff --git a/camelot/image_processing.py b/camelot/image_processing.py index 3051852..3753a0c 100644 --- a/camelot/image_processing.py +++ b/camelot/image_processing.py @@ -39,17 +39,23 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2): if process_background: threshold = cv2.adaptiveThreshold( - gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, - cv2.THRESH_BINARY, blocksize, c) + gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, blocksize, c + ) else: threshold = cv2.adaptiveThreshold( - np.invert(gray), 255, - cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, blocksize, c) + np.invert(gray), + 255, + cv2.ADAPTIVE_THRESH_GAUSSIAN_C, + cv2.THRESH_BINARY, + blocksize, + c, + ) return img, threshold -def find_lines(threshold, regions=None, direction='horizontal', - line_scale=15, iterations=0): +def find_lines( + threshold, regions=None, direction="horizontal", line_scale=15, iterations=0 +): """Finds horizontal and vertical lines by applying morphological transformations on an image. 
@@ -87,15 +93,14 @@ def find_lines(threshold, regions=None, direction='horizontal', """ lines = [] - if direction == 'vertical': + if direction == "vertical": size = threshold.shape[0] // line_scale el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size)) - elif direction == 'horizontal': + elif direction == "horizontal": size = threshold.shape[1] // line_scale el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1)) elif direction is None: - raise ValueError("Specify direction as either 'vertical' or" - " 'horizontal'") + raise ValueError("Specify direction as either 'vertical' or" " 'horizontal'") if regions is not None: region_mask = np.zeros(threshold.shape) @@ -110,19 +115,21 @@ def find_lines(threshold, regions=None, direction='horizontal', try: _, contours, _ = cv2.findContours( - threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE + ) except ValueError: # for opencv backward compatibility contours, _ = cv2.findContours( - threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE + ) for c in contours: x, y, w, h = cv2.boundingRect(c) x1, x2 = x, x + w y1, y2 = y, y + h - if direction == 'vertical': + if direction == "vertical": lines.append(((x1 + x2) // 2, y2, (x1 + x2) // 2, y1)) - elif direction == 'horizontal': + elif direction == "horizontal": lines.append((x1, (y1 + y2) // 2, x2, (y1 + y2) // 2)) return dmask, lines @@ -150,11 +157,13 @@ def find_contours(vertical, horizontal): try: __, contours, __ = cv2.findContours( - mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE + ) except ValueError: # for opencv backward compatibility contours, __ = cv2.findContours( - mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE + ) # sort in reverse based on contour area and use first 10 contours contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10] @@ -196,11 +205,13 @@ def find_joints(contours, vertical, horizontal): roi = joints[y : y + h, x : x + w] try: __, jc, __ = cv2.findContours( - roi.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE) + roi.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE + ) except ValueError: # for opencv backward compatibility jc, __ = cv2.findContours( - roi.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE) + roi.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE + ) if len(jc) <= 4: # remove contours with less than 4 joints continue joint_coords = [] diff --git a/camelot/io.py b/camelot/io.py index 5162dd2..a27a7c6 100644 --- a/camelot/io.py +++ b/camelot/io.py @@ -6,8 +6,15 @@ from .handlers import PDFHandler from .utils import validate_input, remove_extra -def read_pdf(filepath, pages='1', password=None, flavor='lattice', - suppress_stdout=False, layout_kwargs={}, **kwargs): +def read_pdf( + filepath, + pages="1", + password=None, + flavor="lattice", + suppress_stdout=False, + layout_kwargs={}, + **kwargs +): """Read PDF and return extracted tables. Note: kwargs annotated with ^ can only be used with flavor='stream' @@ -91,9 +98,10 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice', tables : camelot.core.TableList """ - if flavor not in ['lattice', 'stream']: - raise NotImplementedError("Unknown flavor specified." 
- " Use either 'lattice' or 'stream'") + if flavor not in ["lattice", "stream"]: + raise NotImplementedError( + "Unknown flavor specified." " Use either 'lattice' or 'stream'" + ) with warnings.catch_warnings(): if suppress_stdout: @@ -102,6 +110,10 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice', validate_input(kwargs, flavor=flavor) p = PDFHandler(filepath, pages=pages, password=password) kwargs = remove_extra(kwargs, flavor=flavor) - tables = p.parse(flavor=flavor, suppress_stdout=suppress_stdout, - layout_kwargs=layout_kwargs, **kwargs) + tables = p.parse( + flavor=flavor, + suppress_stdout=suppress_stdout, + layout_kwargs=layout_kwargs, + **kwargs + ) return tables diff --git a/camelot/parsers/base.py b/camelot/parsers/base.py index a20cd5e..cb1bc21 100644 --- a/camelot/parsers/base.py +++ b/camelot/parsers/base.py @@ -8,13 +8,13 @@ from ..utils import get_page_layout, get_text_objects class BaseParser(object): """Defines a base parser. """ + def _generate_layout(self, filename, layout_kwargs): self.filename = filename self.layout_kwargs = layout_kwargs - self.layout, self.dimensions = get_page_layout( - filename, **layout_kwargs) - self.images = get_text_objects(self.layout, ltype='image') - self.horizontal_text = get_text_objects(self.layout, ltype='horizontal_text') - self.vertical_text = get_text_objects(self.layout, ltype='vertical_text') + self.layout, self.dimensions = get_page_layout(filename, **layout_kwargs) + self.images = get_text_objects(self.layout, ltype="image") + self.horizontal_text = get_text_objects(self.layout, ltype="horizontal_text") + self.vertical_text = get_text_objects(self.layout, ltype="vertical_text") self.pdf_width, self.pdf_height = self.dimensions self.rootname, __ = os.path.splitext(self.filename) diff --git a/camelot/parsers/lattice.py b/camelot/parsers/lattice.py index 6d7dbd0..197ff9f 100644 --- a/camelot/parsers/lattice.py +++ b/camelot/parsers/lattice.py @@ -14,14 +14,25 @@ import pandas as pd from .base import BaseParser from ..core import Table -from ..utils import (scale_image, scale_pdf, segments_in_bbox, text_in_bbox, - merge_close_lines, get_table_index, compute_accuracy, - compute_whitespace) -from ..image_processing import (adaptive_threshold, find_lines, - find_contours, find_joints) +from ..utils import ( + scale_image, + scale_pdf, + segments_in_bbox, + text_in_bbox, + merge_close_lines, + get_table_index, + compute_accuracy, + compute_whitespace, +) +from ..image_processing import ( + adaptive_threshold, + find_lines, + find_contours, + find_joints, +) -logger = logging.getLogger('camelot') +logger = logging.getLogger("camelot") class Lattice(BaseParser): @@ -83,11 +94,26 @@ class Lattice(BaseParser): Resolution used for PDF to PNG conversion. 
""" - def __init__(self, table_regions=None, table_areas=None, process_background=False, - line_scale=15, copy_text=None, shift_text=['l', 't'], - split_text=False, flag_size=False, strip_text='', line_tol=2, - joint_tol=2, threshold_blocksize=15, threshold_constant=-2, - iterations=0, resolution=300, **kwargs): + + def __init__( + self, + table_regions=None, + table_areas=None, + process_background=False, + line_scale=15, + copy_text=None, + shift_text=["l", "t"], + split_text=False, + flag_size=False, + strip_text="", + line_tol=2, + joint_tol=2, + threshold_blocksize=15, + threshold_constant=-2, + iterations=0, + resolution=300, + **kwargs + ): self.table_regions = table_regions self.table_areas = table_areas self.process_background = process_background @@ -130,19 +156,19 @@ class Lattice(BaseParser): indices = [] for r_idx, c_idx, text in idx: for d in shift_text: - if d == 'l': + if d == "l": if t.cells[r_idx][c_idx].hspan: while not t.cells[r_idx][c_idx].left: c_idx -= 1 - if d == 'r': + if d == "r": if t.cells[r_idx][c_idx].hspan: while not t.cells[r_idx][c_idx].right: c_idx += 1 - if d == 't': + if d == "t": if t.cells[r_idx][c_idx].vspan: while not t.cells[r_idx][c_idx].top: r_idx -= 1 - if d == 'b': + if d == "b": if t.cells[r_idx][c_idx].vspan: while not t.cells[r_idx][c_idx].bottom: r_idx += 1 @@ -171,13 +197,13 @@ class Lattice(BaseParser): if f == "h": for i in range(len(t.cells)): for j in range(len(t.cells[i])): - if t.cells[i][j].text.strip() == '': + if t.cells[i][j].text.strip() == "": if t.cells[i][j].hspan and not t.cells[i][j].left: t.cells[i][j].text = t.cells[i][j - 1].text elif f == "v": for i in range(len(t.cells)): for j in range(len(t.cells[i])): - if t.cells[i][j].text.strip() == '': + if t.cells[i][j].text.strip() == "": if t.cells[i][j].vspan and not t.cells[i][j].top: t.cells[i][j].text = t.cells[i - 1][j].text return t @@ -185,11 +211,12 @@ class Lattice(BaseParser): def _generate_image(self): from ..ext.ghostscript import Ghostscript - self.imagename = ''.join([self.rootname, '.png']) - gs_call = '-q -sDEVICE=png16m -o {} -r300 {}'.format( - self.imagename, self.filename) + self.imagename = "".join([self.rootname, ".png"]) + gs_call = "-q -sDEVICE=png16m -o {} -r300 {}".format( + self.imagename, self.filename + ) gs_call = gs_call.encode().split() - null = open(os.devnull, 'wb') + null = open(os.devnull, "wb") with Ghostscript(*gs_call, stdout=null) as gs: pass null.close() @@ -208,8 +235,11 @@ class Lattice(BaseParser): return scaled_areas self.image, self.threshold = adaptive_threshold( - self.imagename, process_background=self.process_background, - blocksize=self.threshold_blocksize, c=self.threshold_constant) + self.imagename, + process_background=self.process_background, + blocksize=self.threshold_blocksize, + c=self.threshold_constant, + ) image_width = self.image.shape[1] image_height = self.image.shape[0] @@ -226,21 +256,35 @@ class Lattice(BaseParser): regions = scale_areas(self.table_regions) vertical_mask, vertical_segments = find_lines( - self.threshold, regions=regions, direction='vertical', - line_scale=self.line_scale, iterations=self.iterations) + self.threshold, + regions=regions, + direction="vertical", + line_scale=self.line_scale, + iterations=self.iterations, + ) horizontal_mask, horizontal_segments = find_lines( - self.threshold, regions=regions, direction='horizontal', - line_scale=self.line_scale, iterations=self.iterations) + self.threshold, + regions=regions, + direction="horizontal", + line_scale=self.line_scale, + 
iterations=self.iterations, + ) contours = find_contours(vertical_mask, horizontal_mask) table_bbox = find_joints(contours, vertical_mask, horizontal_mask) else: vertical_mask, vertical_segments = find_lines( - self.threshold, direction='vertical', line_scale=self.line_scale, - iterations=self.iterations) + self.threshold, + direction="vertical", + line_scale=self.line_scale, + iterations=self.iterations, + ) horizontal_mask, horizontal_segments = find_lines( - self.threshold, direction='horizontal', line_scale=self.line_scale, - iterations=self.iterations) + self.threshold, + direction="horizontal", + line_scale=self.line_scale, + iterations=self.iterations, + ) areas = scale_areas(self.table_areas) table_bbox = find_joints(areas, vertical_mask, horizontal_mask) @@ -248,18 +292,20 @@ class Lattice(BaseParser): self.table_bbox_unscaled = copy.deepcopy(table_bbox) self.table_bbox, self.vertical_segments, self.horizontal_segments = scale_image( - table_bbox, vertical_segments, horizontal_segments, pdf_scalers) + table_bbox, vertical_segments, horizontal_segments, pdf_scalers + ) def _generate_columns_and_rows(self, table_idx, tk): # select elements which lie within table_bbox t_bbox = {} v_s, h_s = segments_in_bbox( - tk, self.vertical_segments, self.horizontal_segments) - t_bbox['horizontal'] = text_in_bbox(tk, self.horizontal_text) - t_bbox['vertical'] = text_in_bbox(tk, self.vertical_text) + tk, self.vertical_segments, self.horizontal_segments + ) + t_bbox["horizontal"] = text_in_bbox(tk, self.horizontal_text) + t_bbox["vertical"] = text_in_bbox(tk, self.vertical_text) - t_bbox['horizontal'].sort(key=lambda x: (-x.y0, x.x0)) - t_bbox['vertical'].sort(key=lambda x: (x.x0, -x.y0)) + t_bbox["horizontal"].sort(key=lambda x: (-x.y0, x.x0)) + t_bbox["vertical"].sort(key=lambda x: (x.x0, -x.y0)) self.t_bbox = t_bbox @@ -268,23 +314,19 @@ class Lattice(BaseParser): cols.extend([tk[0], tk[2]]) rows.extend([tk[1], tk[3]]) # sort horizontal and vertical segments - cols = merge_close_lines( - sorted(cols), line_tol=self.line_tol) - rows = merge_close_lines( - sorted(rows, reverse=True), line_tol=self.line_tol) + cols = merge_close_lines(sorted(cols), line_tol=self.line_tol) + rows = merge_close_lines(sorted(rows, reverse=True), line_tol=self.line_tol) # make grid using x and y coord of shortlisted rows and cols - cols = [(cols[i], cols[i + 1]) - for i in range(0, len(cols) - 1)] - rows = [(rows[i], rows[i + 1]) - for i in range(0, len(rows) - 1)] + cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)] + rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)] return cols, rows, v_s, h_s def _generate_table(self, table_idx, cols, rows, **kwargs): - v_s = kwargs.get('v_s') - h_s = kwargs.get('h_s') + v_s = kwargs.get("v_s") + h_s = kwargs.get("h_s") if v_s is None or h_s is None: - raise ValueError('No segments found on {}'.format(self.rootname)) + raise ValueError("No segments found on {}".format(self.rootname)) table = Table(cols, rows) # set table edges to True using ver+hor lines @@ -297,14 +339,21 @@ class Lattice(BaseParser): pos_errors = [] # TODO: have a single list in place of two directional ones? # sorted on x-coordinate based on reading order i.e. 
LTR or RTL - for direction in ['vertical', 'horizontal']: + for direction in ["vertical", "horizontal"]: for t in self.t_bbox[direction]: indices, error = get_table_index( - table, t, direction, split_text=self.split_text, - flag_size=self.flag_size, strip_text=self.strip_text) + table, + t, + direction, + split_text=self.split_text, + flag_size=self.flag_size, + strip_text=self.strip_text, + ) if indices[:2] != (-1, -1): pos_errors.append(error) - indices = Lattice._reduce_index(table, indices, shift_text=self.shift_text) + indices = Lattice._reduce_index( + table, indices, shift_text=self.shift_text + ) for r_idx, c_idx, text in indices: table.cells[r_idx][c_idx].text = text accuracy = compute_accuracy([[100, pos_errors]]) @@ -317,11 +366,11 @@ class Lattice(BaseParser): table.shape = table.df.shape whitespace = compute_whitespace(data) - table.flavor = 'lattice' + table.flavor = "lattice" table.accuracy = accuracy table.whitespace = whitespace table.order = table_idx + 1 - table.page = int(os.path.basename(self.rootname).replace('page-', '')) + table.page = int(os.path.basename(self.rootname).replace("page-", "")) # for plotting _text = [] @@ -337,15 +386,18 @@ class Lattice(BaseParser): def extract_tables(self, filename, suppress_stdout=False, layout_kwargs={}): self._generate_layout(filename, layout_kwargs) if not suppress_stdout: - logger.info('Processing {}'.format(os.path.basename(self.rootname))) + logger.info("Processing {}".format(os.path.basename(self.rootname))) if not self.horizontal_text: if self.images: - warnings.warn('{} is image-based, camelot only works on' - ' text-based pages.'.format(os.path.basename(self.rootname))) + warnings.warn( + "{} is image-based, camelot only works on" + " text-based pages.".format(os.path.basename(self.rootname)) + ) else: - warnings.warn('No tables found on {}'.format( - os.path.basename(self.rootname))) + warnings.warn( + "No tables found on {}".format(os.path.basename(self.rootname)) + ) return [] self._generate_image() @@ -353,8 +405,9 @@ class Lattice(BaseParser): _tables = [] # sort tables based on y-coord - for table_idx, tk in enumerate(sorted( - self.table_bbox.keys(), key=lambda x: x[1], reverse=True)): + for table_idx, tk in enumerate( + sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True) + ): cols, rows, v_s, h_s = self._generate_columns_and_rows(table_idx, tk) table = self._generate_table(table_idx, cols, rows, v_s=v_s, h_s=h_s) table._bbox = tk diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py index f36fa40..33f2fe5 100644 --- a/camelot/parsers/stream.py +++ b/camelot/parsers/stream.py @@ -10,11 +10,10 @@ import pandas as pd from .base import BaseParser from ..core import TextEdges, Table -from ..utils import (text_in_bbox, get_table_index, compute_accuracy, - compute_whitespace) +from ..utils import text_in_bbox, get_table_index, compute_accuracy, compute_whitespace -logger = logging.getLogger('camelot') +logger = logging.getLogger("camelot") class Stream(BaseParser): @@ -55,9 +54,20 @@ class Stream(BaseParser): to generate columns. 
""" - def __init__(self, table_regions=None, table_areas=None, columns=None, split_text=False, - flag_size=False, strip_text='', edge_tol=50, row_tol=2, - column_tol=0, **kwargs): + + def __init__( + self, + table_regions=None, + table_areas=None, + columns=None, + split_text=False, + flag_size=False, + strip_text="", + edge_tol=50, + row_tol=2, + column_tol=0, + **kwargs + ): self.table_regions = table_regions self.table_areas = table_areas self.columns = columns @@ -150,8 +160,9 @@ class Stream(BaseParser): else: lower = merged[-1] if column_tol >= 0: - if (higher[0] <= lower[1] or - np.isclose(higher[0], lower[1], atol=column_tol)): + if higher[0] <= lower[1] or np.isclose( + higher[0], lower[1], atol=column_tol + ): upper_bound = max(lower[1], higher[1]) lower_bound = min(lower[0], higher[0]) merged[-1] = (lower_bound, upper_bound) @@ -186,13 +197,14 @@ class Stream(BaseParser): List of continuous row y-coordinate tuples. """ - row_mids = [sum([(t.y0 + t.y1) / 2 for t in r]) / len(r) - if len(r) > 0 else 0 for r in rows_grouped] + row_mids = [ + sum([(t.y0 + t.y1) / 2 for t in r]) / len(r) if len(r) > 0 else 0 + for r in rows_grouped + ] rows = [(row_mids[i] + row_mids[i - 1]) / 2 for i in range(1, len(row_mids))] rows.insert(0, text_y_max) rows.append(text_y_min) - rows = [(rows[i], rows[i + 1]) - for i in range(0, len(rows) - 1)] + rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)] return rows @staticmethod @@ -217,8 +229,9 @@ class Stream(BaseParser): if text: text = Stream._group_rows(text, row_tol=row_tol) elements = [len(r) for r in text] - new_cols = [(t.x0, t.x1) - for r in text if len(r) == max(elements) for t in r] + new_cols = [ + (t.x0, t.x1) for r in text if len(r) == max(elements) for t in r + ] cols.extend(Stream._merge_columns(sorted(new_cols))) return cols @@ -243,15 +256,13 @@ class Stream(BaseParser): cols = [(cols[i][0] + cols[i - 1][1]) / 2 for i in range(1, len(cols))] cols.insert(0, text_x_min) cols.append(text_x_max) - cols = [(cols[i], cols[i + 1]) - for i in range(0, len(cols) - 1)] + cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)] return cols def _validate_columns(self): if self.table_areas is not None and self.columns is not None: if len(self.table_areas) != len(self.columns): - raise ValueError("Length of table_areas and columns" - " should be equal") + raise ValueError("Length of table_areas and columns" " should be equal") def _nurminen_table_detection(self, textlines): """A general implementation of the table detection algorithm @@ -309,16 +320,16 @@ class Stream(BaseParser): def _generate_columns_and_rows(self, table_idx, tk): # select elements which lie within table_bbox t_bbox = {} - t_bbox['horizontal'] = text_in_bbox(tk, self.horizontal_text) - t_bbox['vertical'] = text_in_bbox(tk, self.vertical_text) + t_bbox["horizontal"] = text_in_bbox(tk, self.horizontal_text) + t_bbox["vertical"] = text_in_bbox(tk, self.vertical_text) - t_bbox['horizontal'].sort(key=lambda x: (-x.y0, x.x0)) - t_bbox['vertical'].sort(key=lambda x: (x.x0, -x.y0)) + t_bbox["horizontal"].sort(key=lambda x: (-x.y0, x.x0)) + t_bbox["vertical"].sort(key=lambda x: (x.x0, -x.y0)) self.t_bbox = t_bbox text_x_min, text_y_min, text_x_max, text_y_max = self._text_bbox(self.t_bbox) - rows_grouped = self._group_rows(self.t_bbox['horizontal'], row_tol=self.row_tol) + rows_grouped = self._group_rows(self.t_bbox["horizontal"], row_tol=self.row_tol) rows = self._join_rows(rows_grouped, text_y_max, text_y_min) elements = [len(r) for r in rows_grouped] @@ -327,7 
+338,7 @@ class Stream(BaseParser): # take (0, pdf_width) by default # similar to else condition # len can't be 1 - cols = self.columns[table_idx].split(',') + cols = self.columns[table_idx].split(",") cols = [float(c) for c in cols] cols.insert(0, text_x_min) cols.append(text_x_max) @@ -346,20 +357,29 @@ class Stream(BaseParser): if len(elements): ncols = max(set(elements), key=elements.count) else: - warnings.warn("No tables found in table area {}".format( - table_idx + 1)) + warnings.warn( + "No tables found in table area {}".format(table_idx + 1) + ) cols = [(t.x0, t.x1) for r in rows_grouped if len(r) == ncols for t in r] cols = self._merge_columns(sorted(cols), column_tol=self.column_tol) inner_text = [] for i in range(1, len(cols)): left = cols[i - 1][1] right = cols[i][0] - inner_text.extend([t for direction in self.t_bbox - for t in self.t_bbox[direction] - if t.x0 > left and t.x1 < right]) - outer_text = [t for direction in self.t_bbox - for t in self.t_bbox[direction] - if t.x0 > cols[-1][1] or t.x1 < cols[0][0]] + inner_text.extend( + [ + t + for direction in self.t_bbox + for t in self.t_bbox[direction] + if t.x0 > left and t.x1 < right + ] + ) + outer_text = [ + t + for direction in self.t_bbox + for t in self.t_bbox[direction] + if t.x0 > cols[-1][1] or t.x1 < cols[0][0] + ] inner_text.extend(outer_text) cols = self._add_columns(cols, inner_text, self.row_tol) cols = self._join_columns(cols, text_x_min, text_x_max) @@ -373,11 +393,16 @@ class Stream(BaseParser): pos_errors = [] # TODO: have a single list in place of two directional ones? # sorted on x-coordinate based on reading order i.e. LTR or RTL - for direction in ['vertical', 'horizontal']: + for direction in ["vertical", "horizontal"]: for t in self.t_bbox[direction]: indices, error = get_table_index( - table, t, direction, split_text=self.split_text, - flag_size=self.flag_size, strip_text=self.strip_text) + table, + t, + direction, + split_text=self.split_text, + flag_size=self.flag_size, + strip_text=self.strip_text, + ) if indices[:2] != (-1, -1): pos_errors.append(error) for r_idx, c_idx, text in indices: @@ -389,11 +414,11 @@ class Stream(BaseParser): table.shape = table.df.shape whitespace = compute_whitespace(data) - table.flavor = 'stream' + table.flavor = "stream" table.accuracy = accuracy table.whitespace = whitespace table.order = table_idx + 1 - table.page = int(os.path.basename(self.rootname).replace('page-', '')) + table.page = int(os.path.basename(self.rootname).replace("page-", "")) # for plotting _text = [] @@ -409,23 +434,27 @@ class Stream(BaseParser): def extract_tables(self, filename, suppress_stdout=False, layout_kwargs={}): self._generate_layout(filename, layout_kwargs) if not suppress_stdout: - logger.info('Processing {}'.format(os.path.basename(self.rootname))) + logger.info("Processing {}".format(os.path.basename(self.rootname))) if not self.horizontal_text: if self.images: - warnings.warn('{} is image-based, camelot only works on' - ' text-based pages.'.format(os.path.basename(self.rootname))) + warnings.warn( + "{} is image-based, camelot only works on" + " text-based pages.".format(os.path.basename(self.rootname)) + ) else: - warnings.warn('No tables found on {}'.format( - os.path.basename(self.rootname))) + warnings.warn( + "No tables found on {}".format(os.path.basename(self.rootname)) + ) return [] self._generate_table_bbox() _tables = [] # sort tables based on y-coord - for table_idx, tk in enumerate(sorted( - self.table_bbox.keys(), key=lambda x: x[1], reverse=True)): + for table_idx, 
tk in enumerate( + sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True) + ): cols, rows = self._generate_columns_and_rows(table_idx, tk) table = self._generate_table(table_idx, cols, rows) table._bbox = tk diff --git a/camelot/plotting.py b/camelot/plotting.py index 1320267..5e0dc0c 100644 --- a/camelot/plotting.py +++ b/camelot/plotting.py @@ -10,7 +10,7 @@ else: class PlotMethods(object): - def __call__(self, table, kind='text', filename=None): + def __call__(self, table, kind="text", filename=None): """Plot elements found on PDF page based on kind specified, useful for debugging and playing with different parameters to get the best output. @@ -31,14 +31,16 @@ class PlotMethods(object): """ if not _HAS_MPL: - raise ImportError('matplotlib is required for plotting.') + raise ImportError("matplotlib is required for plotting.") - if table.flavor == 'lattice' and kind in ['textedge']: - raise NotImplementedError("Lattice flavor does not support kind='{}'".format( - kind)) - elif table.flavor == 'stream' and kind in ['joint', 'line']: - raise NotImplementedError("Stream flavor does not support kind='{}'".format( - kind)) + if table.flavor == "lattice" and kind in ["textedge"]: + raise NotImplementedError( + "Lattice flavor does not support kind='{}'".format(kind) + ) + elif table.flavor == "stream" and kind in ["joint", "line"]: + raise NotImplementedError( + "Stream flavor does not support kind='{}'".format(kind) + ) plot_method = getattr(self, kind) return plot_method(table) @@ -57,18 +59,12 @@ class PlotMethods(object): """ fig = plt.figure() - ax = fig.add_subplot(111, aspect='equal') + ax = fig.add_subplot(111, aspect="equal") xs, ys = [], [] for t in table._text: xs.extend([t[0], t[2]]) ys.extend([t[1], t[3]]) - ax.add_patch( - patches.Rectangle( - (t[0], t[1]), - t[2] - t[0], - t[3] - t[1] - ) - ) + ax.add_patch(patches.Rectangle((t[0], t[1]), t[2] - t[0], t[3] - t[1])) ax.set_xlim(min(xs) - 10, max(xs) + 10) ax.set_ylim(min(ys) - 10, max(ys) + 10) return fig @@ -87,21 +83,17 @@ class PlotMethods(object): """ fig = plt.figure() - ax = fig.add_subplot(111, aspect='equal') + ax = fig.add_subplot(111, aspect="equal") for row in table.cells: for cell in row: if cell.left: - ax.plot([cell.lb[0], cell.lt[0]], - [cell.lb[1], cell.lt[1]]) + ax.plot([cell.lb[0], cell.lt[0]], [cell.lb[1], cell.lt[1]]) if cell.right: - ax.plot([cell.rb[0], cell.rt[0]], - [cell.rb[1], cell.rt[1]]) + ax.plot([cell.rb[0], cell.rt[0]], [cell.rb[1], cell.rt[1]]) if cell.top: - ax.plot([cell.lt[0], cell.rt[0]], - [cell.lt[1], cell.rt[1]]) + ax.plot([cell.lt[0], cell.rt[0]], [cell.lt[1], cell.rt[1]]) if cell.bottom: - ax.plot([cell.lb[0], cell.rb[0]], - [cell.lb[1], cell.rb[1]]) + ax.plot([cell.lb[0], cell.rb[0]], [cell.lb[1], cell.rb[1]]) return fig def contour(self, table): @@ -124,7 +116,7 @@ class PlotMethods(object): img, table_bbox = (None, {table._bbox: None}) _FOR_LATTICE = False fig = plt.figure() - ax = fig.add_subplot(111, aspect='equal') + ax = fig.add_subplot(111, aspect="equal") xs, ys = [], [] if not _FOR_LATTICE: @@ -133,21 +125,14 @@ class PlotMethods(object): ys.extend([t[1], t[3]]) ax.add_patch( patches.Rectangle( - (t[0], t[1]), - t[2] - t[0], - t[3] - t[1], - color='blue' + (t[0], t[1]), t[2] - t[0], t[3] - t[1], color="blue" ) ) for t in table_bbox.keys(): ax.add_patch( patches.Rectangle( - (t[0], t[1]), - t[2] - t[0], - t[3] - t[1], - fill=False, - color='red' + (t[0], t[1]), t[2] - t[0], t[3] - t[1], fill=False, color="red" ) ) if not _FOR_LATTICE: @@ -173,25 +158,19 @@ class 
PlotMethods(object): """ fig = plt.figure() - ax = fig.add_subplot(111, aspect='equal') + ax = fig.add_subplot(111, aspect="equal") xs, ys = [], [] for t in table._text: xs.extend([t[0], t[2]]) ys.extend([t[1], t[3]]) ax.add_patch( - patches.Rectangle( - (t[0], t[1]), - t[2] - t[0], - t[3] - t[1], - color='blue' - ) + patches.Rectangle((t[0], t[1]), t[2] - t[0], t[3] - t[1], color="blue") ) ax.set_xlim(min(xs) - 10, max(xs) + 10) ax.set_ylim(min(ys) - 10, max(ys) + 10) for te in table._textedges: - ax.plot([te.x, te.x], - [te.y0, te.y1]) + ax.plot([te.x, te.x], [te.y0, te.y1]) return fig @@ -210,14 +189,14 @@ class PlotMethods(object): """ img, table_bbox = table._image fig = plt.figure() - ax = fig.add_subplot(111, aspect='equal') + ax = fig.add_subplot(111, aspect="equal") x_coord = [] y_coord = [] for k in table_bbox.keys(): for coord in table_bbox[k]: x_coord.append(coord[0]) y_coord.append(coord[1]) - ax.plot(x_coord, y_coord, 'ro') + ax.plot(x_coord, y_coord, "ro") ax.imshow(img) return fig @@ -235,7 +214,7 @@ class PlotMethods(object): """ fig = plt.figure() - ax = fig.add_subplot(111, aspect='equal') + ax = fig.add_subplot(111, aspect="equal") vertical, horizontal = table._segments for v in vertical: ax.plot([v[0], v[2]], [v[1], v[3]]) diff --git a/camelot/utils.py b/camelot/utils.py index 48e39af..2051c45 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -19,8 +19,14 @@ from pdfminer.pdfpage import PDFTextExtractionNotAllowed from pdfminer.pdfinterp import PDFResourceManager from pdfminer.pdfinterp import PDFPageInterpreter from pdfminer.converter import PDFPageAggregator -from pdfminer.layout import (LAParams, LTAnno, LTChar, LTTextLineHorizontal, - LTTextLineVertical, LTImage) +from pdfminer.layout import ( + LAParams, + LTAnno, + LTChar, + LTTextLineHorizontal, + LTTextLineVertical, + LTImage, +) PY3 = sys.version_info[0] >= 3 @@ -35,7 +41,7 @@ else: _VALID_URLS = set(uses_relative + uses_netloc + uses_params) -_VALID_URLS.discard('') +_VALID_URLS.discard("") # https://github.com/pandas-dev/pandas/blob/master/pandas/io/common.py @@ -59,9 +65,11 @@ def is_url(url): def random_string(length): - ret = '' + ret = "" while length: - ret += random.choice(string.digits + string.ascii_lowercase + string.ascii_uppercase) + ret += random.choice( + string.digits + string.ascii_lowercase + string.ascii_uppercase + ) length -= 1 return ret @@ -79,14 +87,14 @@ def download_url(url): Temporary filepath. 
""" - filename = '{}.pdf'.format(random_string(6)) - with tempfile.NamedTemporaryFile('wb', delete=False) as f: + filename = "{}.pdf".format(random_string(6)) + with tempfile.NamedTemporaryFile("wb", delete=False) as f: obj = urlopen(url) if PY3: content_type = obj.info().get_content_type() else: - content_type = obj.info().getheader('Content-Type') - if content_type != 'application/pdf': + content_type = obj.info().getheader("Content-Type") + if content_type != "application/pdf": raise NotImplementedError("File format not supported") f.write(obj.read()) filepath = os.path.join(os.path.dirname(f.name), filename) @@ -94,39 +102,38 @@ def download_url(url): return filepath -stream_kwargs = [ - 'columns', - 'row_tol', - 'column_tol' -] +stream_kwargs = ["columns", "row_tol", "column_tol"] lattice_kwargs = [ - 'process_background', - 'line_scale', - 'copy_text', - 'shift_text', - 'line_tol', - 'joint_tol', - 'threshold_blocksize', - 'threshold_constant', - 'iterations' + "process_background", + "line_scale", + "copy_text", + "shift_text", + "line_tol", + "joint_tol", + "threshold_blocksize", + "threshold_constant", + "iterations", ] -def validate_input(kwargs, flavor='lattice'): +def validate_input(kwargs, flavor="lattice"): def check_intersection(parser_kwargs, input_kwargs): isec = set(parser_kwargs).intersection(set(input_kwargs.keys())) if isec: - raise ValueError("{} cannot be used with flavor='{}'".format( - ",".join(sorted(isec)), flavor)) + raise ValueError( + "{} cannot be used with flavor='{}'".format( + ",".join(sorted(isec)), flavor + ) + ) - if flavor == 'lattice': + if flavor == "lattice": check_intersection(stream_kwargs, kwargs) else: check_intersection(lattice_kwargs, kwargs) -def remove_extra(kwargs, flavor='lattice'): - if flavor == 'lattice': +def remove_extra(kwargs, flavor="lattice"): + if flavor == "lattice": for key in kwargs.keys(): if key in stream_kwargs: kwargs.pop(key) @@ -256,15 +263,19 @@ def scale_image(tables, v_segments, h_segments, factors): v_segments_new = [] for v in v_segments: x1, x2 = scale(v[0], scaling_factor_x), scale(v[2], scaling_factor_x) - y1, y2 = scale(abs(translate(-img_y, v[1])), scaling_factor_y), scale( - abs(translate(-img_y, v[3])), scaling_factor_y) + y1, y2 = ( + scale(abs(translate(-img_y, v[1])), scaling_factor_y), + scale(abs(translate(-img_y, v[3])), scaling_factor_y), + ) v_segments_new.append((x1, y1, x2, y2)) h_segments_new = [] for h in h_segments: x1, x2 = scale(h[0], scaling_factor_x), scale(h[2], scaling_factor_x) - y1, y2 = scale(abs(translate(-img_y, h[1])), scaling_factor_y), scale( - abs(translate(-img_y, h[3])), scaling_factor_y) + y1, y2 = ( + scale(abs(translate(-img_y, h[1])), scaling_factor_y), + scale(abs(translate(-img_y, h[3])), scaling_factor_y), + ) h_segments_new.append((x1, y1, x2, y2)) return tables_new, v_segments_new, h_segments_new @@ -291,13 +302,13 @@ def get_rotation(chars, horizontal_text, vertical_text): rotated 90 degree clockwise. 
""" - rotation = '' + rotation = "" hlen = len([t for t in horizontal_text if t.get_text().strip()]) vlen = len([t for t in vertical_text if t.get_text().strip()]) if hlen < vlen: clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in chars) anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in chars) - rotation = 'anticlockwise' if clockwise < anticlockwise else 'clockwise' + rotation = "anticlockwise" if clockwise < anticlockwise else "clockwise" return rotation @@ -325,10 +336,16 @@ def segments_in_bbox(bbox, v_segments, h_segments): """ lb = (bbox[0], bbox[1]) rt = (bbox[2], bbox[3]) - v_s = [v for v in v_segments if v[1] > lb[1] - 2 and - v[3] < rt[1] + 2 and lb[0] - 2 <= v[0] <= rt[0] + 2] - h_s = [h for h in h_segments if h[0] > lb[0] - 2 and - h[2] < rt[0] + 2 and lb[1] - 2 <= h[1] <= rt[1] + 2] + v_s = [ + v + for v in v_segments + if v[1] > lb[1] - 2 and v[3] < rt[1] + 2 and lb[0] - 2 <= v[0] <= rt[0] + 2 + ] + h_s = [ + h + for h in h_segments + if h[0] > lb[0] - 2 and h[2] < rt[0] + 2 and lb[1] - 2 <= h[1] <= rt[1] + 2 + ] return v_s, h_s @@ -351,9 +368,12 @@ def text_in_bbox(bbox, text): """ lb = (bbox[0], bbox[1]) rt = (bbox[2], bbox[3]) - t_bbox = [t for t in text if lb[0] - 2 <= (t.x0 + t.x1) / 2.0 - <= rt[0] + 2 and lb[1] - 2 <= (t.y0 + t.y1) / 2.0 - <= rt[1] + 2] + t_bbox = [ + t + for t in text + if lb[0] - 2 <= (t.x0 + t.x1) / 2.0 <= rt[0] + 2 + and lb[1] - 2 <= (t.y0 + t.y1) / 2.0 <= rt[1] + 2 + ] return t_bbox @@ -390,7 +410,7 @@ def merge_close_lines(ar, line_tol=2): # (inspired from sklearn.pipeline.Pipeline) -def flag_font_size(textline, direction, strip_text=''): +def flag_font_size(textline, direction, strip_text=""): """Flags super/subscripts in text by enclosing them with . May give false positives. @@ -409,10 +429,18 @@ def flag_font_size(textline, direction, strip_text=''): fstring : string """ - if direction == 'horizontal': - d = [(t.get_text(), np.round(t.height, decimals=6)) for t in textline if not isinstance(t, LTAnno)] - elif direction == 'vertical': - d = [(t.get_text(), np.round(t.width, decimals=6)) for t in textline if not isinstance(t, LTAnno)] + if direction == "horizontal": + d = [ + (t.get_text(), np.round(t.height, decimals=6)) + for t in textline + if not isinstance(t, LTAnno) + ] + elif direction == "vertical": + d = [ + (t.get_text(), np.round(t.width, decimals=6)) + for t in textline + if not isinstance(t, LTAnno) + ] l = [np.round(size, decimals=6) for text, size in d] if len(set(l)) > 1: flist = [] @@ -420,21 +448,21 @@ def flag_font_size(textline, direction, strip_text=''): for key, chars in groupby(d, itemgetter(1)): if key == min_size: fchars = [t[0] for t in chars] - if ''.join(fchars).strip(): - fchars.insert(0, '') - fchars.append('') - flist.append(''.join(fchars)) + if "".join(fchars).strip(): + fchars.insert(0, "") + fchars.append("") + flist.append("".join(fchars)) else: fchars = [t[0] for t in chars] - if ''.join(fchars).strip(): - flist.append(''.join(fchars)) - fstring = ''.join(flist).strip(strip_text) + if "".join(fchars).strip(): + flist.append("".join(fchars)) + fstring = "".join(flist).strip(strip_text) else: - fstring = ''.join([t.get_text() for t in textline]).strip(strip_text) + fstring = "".join([t.get_text() for t in textline]).strip(strip_text) return fstring -def split_textline(table, textline, direction, flag_size=False, strip_text=''): +def split_textline(table, textline, direction, flag_size=False, strip_text=""): """Splits PDFMiner LTTextLine into substrings if it spans across multiple 
rows/columns. @@ -464,19 +492,31 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=''): cut_text = [] bbox = textline.bbox try: - if direction == 'horizontal' and not textline.is_empty(): - x_overlap = [i for i, x in enumerate(table.cols) if x[0] <= bbox[2] and bbox[0] <= x[1]] - r_idx = [j for j, r in enumerate(table.rows) if r[1] <= (bbox[1] + bbox[3]) / 2 <= r[0]] + if direction == "horizontal" and not textline.is_empty(): + x_overlap = [ + i + for i, x in enumerate(table.cols) + if x[0] <= bbox[2] and bbox[0] <= x[1] + ] + r_idx = [ + j + for j, r in enumerate(table.rows) + if r[1] <= (bbox[1] + bbox[3]) / 2 <= r[0] + ] r = r_idx[0] - x_cuts = [(c, table.cells[r][c].x2) for c in x_overlap if table.cells[r][c].right] + x_cuts = [ + (c, table.cells[r][c].x2) for c in x_overlap if table.cells[r][c].right + ] if not x_cuts: x_cuts = [(x_overlap[0], table.cells[r][-1].x2)] for obj in textline._objs: row = table.rows[r] for cut in x_cuts: if isinstance(obj, LTChar): - if (row[1] <= (obj.y0 + obj.y1) / 2 <= row[0] and - (obj.x0 + obj.x1) / 2 <= cut[1]): + if ( + row[1] <= (obj.y0 + obj.y1) / 2 <= row[0] + and (obj.x0 + obj.x1) / 2 <= cut[1] + ): cut_text.append((r, cut[0], obj)) break else: @@ -485,19 +525,31 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=''): cut_text.append((r, cut[0] + 1, obj)) elif isinstance(obj, LTAnno): cut_text.append((r, cut[0], obj)) - elif direction == 'vertical' and not textline.is_empty(): - y_overlap = [j for j, y in enumerate(table.rows) if y[1] <= bbox[3] and bbox[1] <= y[0]] - c_idx = [i for i, c in enumerate(table.cols) if c[0] <= (bbox[0] + bbox[2]) / 2 <= c[1]] + elif direction == "vertical" and not textline.is_empty(): + y_overlap = [ + j + for j, y in enumerate(table.rows) + if y[1] <= bbox[3] and bbox[1] <= y[0] + ] + c_idx = [ + i + for i, c in enumerate(table.cols) + if c[0] <= (bbox[0] + bbox[2]) / 2 <= c[1] + ] c = c_idx[0] - y_cuts = [(r, table.cells[r][c].y1) for r in y_overlap if table.cells[r][c].bottom] + y_cuts = [ + (r, table.cells[r][c].y1) for r in y_overlap if table.cells[r][c].bottom + ] if not y_cuts: y_cuts = [(y_overlap[0], table.cells[-1][c].y1)] for obj in textline._objs: col = table.cols[c] for cut in y_cuts: if isinstance(obj, LTChar): - if (col[0] <= (obj.x0 + obj.x1) / 2 <= col[1] and - (obj.y0 + obj.y1) / 2 >= cut[1]): + if ( + col[0] <= (obj.x0 + obj.x1) / 2 <= col[1] + and (obj.y0 + obj.y1) / 2 >= cut[1] + ): cut_text.append((cut[0], c, obj)) break else: @@ -511,15 +563,24 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=''): grouped_chars = [] for key, chars in groupby(cut_text, itemgetter(0, 1)): if flag_size: - grouped_chars.append((key[0], key[1], - flag_font_size([t[2] for t in chars], direction, strip_text=strip_text))) + grouped_chars.append( + ( + key[0], + key[1], + flag_font_size( + [t[2] for t in chars], direction, strip_text=strip_text + ), + ) + ) else: gchars = [t[2].get_text() for t in chars] - grouped_chars.append((key[0], key[1], ''.join(gchars).strip(strip_text))) + grouped_chars.append((key[0], key[1], "".join(gchars).strip(strip_text))) return grouped_chars -def get_table_index(table, t, direction, split_text=False, flag_size=False, strip_text='',): +def get_table_index( + table, t, direction, split_text=False, flag_size=False, strip_text="" +): """Gets indices of the table cell where given text object lies by comparing their y and x-coordinates. 
@@ -558,8 +619,9 @@ def get_table_index(table, t, direction, split_text=False, flag_size=False, stri """ r_idx, c_idx = [-1] * 2 for r in range(len(table.rows)): - if ((t.y0 + t.y1) / 2.0 < table.rows[r][0] and - (t.y0 + t.y1) / 2.0 > table.rows[r][1]): + if (t.y0 + t.y1) / 2.0 < table.rows[r][0] and (t.y0 + t.y1) / 2.0 > table.rows[ + r + ][1]: lt_col_overlap = [] for c in table.cols: if c[0] <= t.x1 and c[1] >= t.x0: @@ -569,11 +631,14 @@ def get_table_index(table, t, direction, split_text=False, flag_size=False, stri else: lt_col_overlap.append(-1) if len(list(filter(lambda x: x != -1, lt_col_overlap))) == 0: - text = t.get_text().strip('\n') + text = t.get_text().strip("\n") text_range = (t.x0, t.x1) col_range = (table.cols[0][0], table.cols[-1][1]) - warnings.warn("{} {} does not lie in column range {}".format( - text, text_range, col_range)) + warnings.warn( + "{} {} does not lie in column range {}".format( + text, text_range, col_range + ) + ) r_idx = r c_idx = lt_col_overlap.index(max(lt_col_overlap)) break @@ -594,10 +659,24 @@ def get_table_index(table, t, direction, split_text=False, flag_size=False, stri error = ((X * (y0_offset + y1_offset)) + (Y * (x0_offset + x1_offset))) / charea if split_text: - return split_textline(table, t, direction, flag_size=flag_size, strip_text=strip_text), error + return ( + split_textline( + table, t, direction, flag_size=flag_size, strip_text=strip_text + ), + error, + ) else: if flag_size: - return [(r_idx, c_idx, flag_font_size(t._objs, direction, strip_text=strip_text))], error + return ( + [ + ( + r_idx, + c_idx, + flag_font_size(t._objs, direction, strip_text=strip_text), + ) + ], + error, + ) else: return [(r_idx, c_idx, t.get_text().strip(strip_text))], error @@ -650,14 +729,20 @@ def compute_whitespace(d): r_nempty_cells, c_nempty_cells = [], [] for i in d: for j in i: - if j.strip() == '': + if j.strip() == "": whitespace += 1 whitespace = 100 * (whitespace / float(len(d) * len(d[0]))) return whitespace -def get_page_layout(filename, char_margin=1.0, line_margin=0.5, word_margin=0.1, - detect_vertical=True, all_texts=True): +def get_page_layout( + filename, + char_margin=1.0, + line_margin=0.5, + word_margin=0.1, + detect_vertical=True, + all_texts=True, +): """Returns a PDFMiner LTPage object and page dimension of a single page pdf. See https://euske.github.io/pdfminer/ to get definitions of kwargs. @@ -680,16 +765,18 @@ def get_page_layout(filename, char_margin=1.0, line_margin=0.5, word_margin=0.1, Dimension of pdf page in the form (width, height). """ - with open(filename, 'rb') as f: + with open(filename, "rb") as f: parser = PDFParser(f) document = PDFDocument(parser) if not document.is_extractable: raise PDFTextExtractionNotAllowed - laparams = LAParams(char_margin=char_margin, - line_margin=line_margin, - word_margin=word_margin, - detect_vertical=detect_vertical, - all_texts=all_texts) + laparams = LAParams( + char_margin=char_margin, + line_margin=line_margin, + word_margin=word_margin, + detect_vertical=detect_vertical, + all_texts=all_texts, + ) rsrcmgr = PDFResourceManager() device = PDFPageAggregator(rsrcmgr, laparams=laparams) interpreter = PDFPageInterpreter(rsrcmgr, device) @@ -721,13 +808,13 @@ def get_text_objects(layout, ltype="char", t=None): List of PDFMiner text objects. 
""" - if ltype == 'char': + if ltype == "char": LTObject = LTChar - elif ltype == 'image': + elif ltype == "image": LTObject = LTImage - elif ltype == 'horizontal_text': + elif ltype == "horizontal_text": LTObject = LTTextLineHorizontal - elif ltype == 'vertical_text': + elif ltype == "vertical_text": LTObject = LTTextLineVertical if t is None: t = []