Blacken code
parent
27d55d056c
commit
2115a0e177
|
|
@ -9,8 +9,8 @@ from .io import read_pdf
|
|||
from .plotting import PlotMethods
|
||||
|
||||
|
||||
def _write_usage(self, prog, args='', prefix='Usage: '):
|
||||
return self._write_usage('camelot', args, prefix=prefix)
|
||||
def _write_usage(self, prog, args="", prefix="Usage: "):
|
||||
return self._write_usage("camelot", args, prefix=prefix)
|
||||
|
||||
|
||||
# monkey patch click.HelpFormatter
|
||||
|
|
@ -18,10 +18,10 @@ HelpFormatter._write_usage = HelpFormatter.write_usage
|
|||
HelpFormatter.write_usage = _write_usage
|
||||
|
||||
# set up logging
|
||||
logger = logging.getLogger('camelot')
|
||||
logger = logging.getLogger("camelot")
|
||||
|
||||
format_string = '%(asctime)s - %(levelname)s - %(message)s'
|
||||
formatter = logging.Formatter(format_string, datefmt='%Y-%m-%dT%H:%M:%S')
|
||||
format_string = "%(asctime)s - %(levelname)s - %(message)s"
|
||||
formatter = logging.Formatter(format_string, datefmt="%Y-%m-%dT%H:%M:%S")
|
||||
handler = logging.StreamHandler()
|
||||
handler.setFormatter(formatter)
|
||||
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@
|
|||
from __future__ import absolute_import
|
||||
|
||||
|
||||
__all__ = ('main',)
|
||||
__all__ = ("main",)
|
||||
|
||||
|
||||
def main():
|
||||
|
|
|
|||
|
|
@ -1,23 +1,23 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
VERSION = (0, 7, 2)
|
||||
PRERELEASE = None # alpha, beta or rc
|
||||
PRERELEASE = None # alpha, beta or rc
|
||||
REVISION = None
|
||||
|
||||
|
||||
def generate_version(version, prerelease=None, revision=None):
|
||||
version_parts = ['.'.join(map(str, version))]
|
||||
version_parts = [".".join(map(str, version))]
|
||||
if prerelease is not None:
|
||||
version_parts.append('-{}'.format(prerelease))
|
||||
version_parts.append("-{}".format(prerelease))
|
||||
if revision is not None:
|
||||
version_parts.append('.{}'.format(revision))
|
||||
return ''.join(version_parts)
|
||||
version_parts.append(".{}".format(revision))
|
||||
return "".join(version_parts)
|
||||
|
||||
|
||||
__title__ = 'camelot-py'
|
||||
__description__ = 'PDF Table Extraction for Humans.'
|
||||
__url__ = 'http://camelot-py.readthedocs.io/'
|
||||
__title__ = "camelot-py"
|
||||
__description__ = "PDF Table Extraction for Humans."
|
||||
__url__ = "http://camelot-py.readthedocs.io/"
|
||||
__version__ = generate_version(VERSION, prerelease=PRERELEASE, revision=REVISION)
|
||||
__author__ = 'Vinayak Mehta'
|
||||
__author_email__ = 'vmehta94@gmail.com'
|
||||
__license__ = 'MIT License'
|
||||
__author__ = "Vinayak Mehta"
|
||||
__author_email__ = "vmehta94@gmail.com"
|
||||
__license__ = "MIT License"
|
||||
|
|
|
|||
341
camelot/cli.py
341
camelot/cli.py
|
|
@ -3,6 +3,7 @@
|
|||
import logging
|
||||
|
||||
import click
|
||||
|
||||
try:
|
||||
import matplotlib.pyplot as plt
|
||||
except ImportError:
|
||||
|
|
@ -13,7 +14,7 @@ else:
|
|||
from . import __version__, read_pdf, plot
|
||||
|
||||
|
||||
logger = logging.getLogger('camelot')
|
||||
logger = logging.getLogger("camelot")
|
||||
logger.setLevel(logging.INFO)
|
||||
|
||||
|
||||
|
|
@ -30,23 +31,47 @@ pass_config = click.make_pass_decorator(Config)
|
|||
|
||||
@click.group()
|
||||
@click.version_option(version=__version__)
|
||||
@click.option('-q', '--quiet', is_flag=False, help='Suppress logs and warnings.')
|
||||
@click.option('-p', '--pages', default='1', help='Comma-separated page numbers.'
|
||||
' Example: 1,3,4 or 1,4-end or all.')
|
||||
@click.option('-pw', '--password', help='Password for decryption.')
|
||||
@click.option('-o', '--output', help='Output file path.')
|
||||
@click.option('-f', '--format',
|
||||
type=click.Choice(['csv', 'json', 'excel', 'html', 'sqlite']),
|
||||
help='Output file format.')
|
||||
@click.option('-z', '--zip', is_flag=True, help='Create ZIP archive.')
|
||||
@click.option('-split', '--split_text', is_flag=True,
|
||||
help='Split text that spans across multiple cells.')
|
||||
@click.option('-flag', '--flag_size', is_flag=True, help='Flag text based on'
|
||||
' font size. Useful to detect super/subscripts.')
|
||||
@click.option('-strip', '--strip_text', help='Characters that should be stripped from a string before'
|
||||
' assigning it to a cell.')
|
||||
@click.option('-M', '--margins', nargs=3, default=(1.0, 0.5, 0.1),
|
||||
help='PDFMiner char_margin, line_margin and word_margin.')
|
||||
@click.option("-q", "--quiet", is_flag=False, help="Suppress logs and warnings.")
|
||||
@click.option(
|
||||
"-p",
|
||||
"--pages",
|
||||
default="1",
|
||||
help="Comma-separated page numbers." " Example: 1,3,4 or 1,4-end or all.",
|
||||
)
|
||||
@click.option("-pw", "--password", help="Password for decryption.")
|
||||
@click.option("-o", "--output", help="Output file path.")
|
||||
@click.option(
|
||||
"-f",
|
||||
"--format",
|
||||
type=click.Choice(["csv", "json", "excel", "html", "sqlite"]),
|
||||
help="Output file format.",
|
||||
)
|
||||
@click.option("-z", "--zip", is_flag=True, help="Create ZIP archive.")
|
||||
@click.option(
|
||||
"-split",
|
||||
"--split_text",
|
||||
is_flag=True,
|
||||
help="Split text that spans across multiple cells.",
|
||||
)
|
||||
@click.option(
|
||||
"-flag",
|
||||
"--flag_size",
|
||||
is_flag=True,
|
||||
help="Flag text based on" " font size. Useful to detect super/subscripts.",
|
||||
)
|
||||
@click.option(
|
||||
"-strip",
|
||||
"--strip_text",
|
||||
help="Characters that should be stripped from a string before"
|
||||
" assigning it to a cell.",
|
||||
)
|
||||
@click.option(
|
||||
"-M",
|
||||
"--margins",
|
||||
nargs=3,
|
||||
default=(1.0, 0.5, 0.1),
|
||||
help="PDFMiner char_margin, line_margin and word_margin.",
|
||||
)
|
||||
@click.pass_context
|
||||
def cli(ctx, *args, **kwargs):
|
||||
"""Camelot: PDF Table Extraction for Humans"""
|
||||
|
|
@ -55,79 +80,131 @@ def cli(ctx, *args, **kwargs):
|
|||
ctx.obj.set_config(key, value)
|
||||
|
||||
|
||||
@cli.command('lattice')
|
||||
@click.option('-R', '--table_regions', default=[], multiple=True,
|
||||
help='Page regions to analyze. Example: x1,y1,x2,y2'
|
||||
' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
|
||||
@click.option('-T', '--table_areas', default=[], multiple=True,
|
||||
help='Table areas to process. Example: x1,y1,x2,y2'
|
||||
' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
|
||||
@click.option('-back', '--process_background', is_flag=True,
|
||||
help='Process background lines.')
|
||||
@click.option('-scale', '--line_scale', default=15,
|
||||
help='Line size scaling factor. The larger the value,'
|
||||
' the smaller the detected lines.')
|
||||
@click.option('-copy', '--copy_text', default=[], type=click.Choice(['h', 'v']),
|
||||
multiple=True, help='Direction in which text in a spanning cell'
|
||||
' will be copied over.')
|
||||
@click.option('-shift', '--shift_text', default=['l', 't'],
|
||||
type=click.Choice(['', 'l', 'r', 't', 'b']), multiple=True,
|
||||
help='Direction in which text in a spanning cell will flow.')
|
||||
@click.option('-l', '--line_tol', default=2,
|
||||
help='Tolerance parameter used to merge close vertical'
|
||||
' and horizontal lines.')
|
||||
@click.option('-j', '--joint_tol', default=2,
|
||||
help='Tolerance parameter used to decide whether'
|
||||
' the detected lines and points lie close to each other.')
|
||||
@click.option('-block', '--threshold_blocksize', default=15,
|
||||
help='For adaptive thresholding, size of a pixel'
|
||||
' neighborhood that is used to calculate a threshold value for'
|
||||
' the pixel. Example: 3, 5, 7, and so on.')
|
||||
@click.option('-const', '--threshold_constant', default=-2,
|
||||
help='For adaptive thresholding, constant subtracted'
|
||||
' from the mean or weighted mean. Normally, it is positive but'
|
||||
' may be zero or negative as well.')
|
||||
@click.option('-I', '--iterations', default=0,
|
||||
help='Number of times for erosion/dilation will be applied.')
|
||||
@click.option('-res', '--resolution', default=300,
|
||||
help='Resolution used for PDF to PNG conversion.')
|
||||
@click.option('-plot', '--plot_type',
|
||||
type=click.Choice(['text', 'grid', 'contour', 'joint', 'line']),
|
||||
help='Plot elements found on PDF page for visual debugging.')
|
||||
@click.argument('filepath', type=click.Path(exists=True))
|
||||
@cli.command("lattice")
|
||||
@click.option(
|
||||
"-R",
|
||||
"--table_regions",
|
||||
default=[],
|
||||
multiple=True,
|
||||
help="Page regions to analyze. Example: x1,y1,x2,y2"
|
||||
" where x1, y1 -> left-top and x2, y2 -> right-bottom.",
|
||||
)
|
||||
@click.option(
|
||||
"-T",
|
||||
"--table_areas",
|
||||
default=[],
|
||||
multiple=True,
|
||||
help="Table areas to process. Example: x1,y1,x2,y2"
|
||||
" where x1, y1 -> left-top and x2, y2 -> right-bottom.",
|
||||
)
|
||||
@click.option(
|
||||
"-back", "--process_background", is_flag=True, help="Process background lines."
|
||||
)
|
||||
@click.option(
|
||||
"-scale",
|
||||
"--line_scale",
|
||||
default=15,
|
||||
help="Line size scaling factor. The larger the value,"
|
||||
" the smaller the detected lines.",
|
||||
)
|
||||
@click.option(
|
||||
"-copy",
|
||||
"--copy_text",
|
||||
default=[],
|
||||
type=click.Choice(["h", "v"]),
|
||||
multiple=True,
|
||||
help="Direction in which text in a spanning cell" " will be copied over.",
|
||||
)
|
||||
@click.option(
|
||||
"-shift",
|
||||
"--shift_text",
|
||||
default=["l", "t"],
|
||||
type=click.Choice(["", "l", "r", "t", "b"]),
|
||||
multiple=True,
|
||||
help="Direction in which text in a spanning cell will flow.",
|
||||
)
|
||||
@click.option(
|
||||
"-l",
|
||||
"--line_tol",
|
||||
default=2,
|
||||
help="Tolerance parameter used to merge close vertical" " and horizontal lines.",
|
||||
)
|
||||
@click.option(
|
||||
"-j",
|
||||
"--joint_tol",
|
||||
default=2,
|
||||
help="Tolerance parameter used to decide whether"
|
||||
" the detected lines and points lie close to each other.",
|
||||
)
|
||||
@click.option(
|
||||
"-block",
|
||||
"--threshold_blocksize",
|
||||
default=15,
|
||||
help="For adaptive thresholding, size of a pixel"
|
||||
" neighborhood that is used to calculate a threshold value for"
|
||||
" the pixel. Example: 3, 5, 7, and so on.",
|
||||
)
|
||||
@click.option(
|
||||
"-const",
|
||||
"--threshold_constant",
|
||||
default=-2,
|
||||
help="For adaptive thresholding, constant subtracted"
|
||||
" from the mean or weighted mean. Normally, it is positive but"
|
||||
" may be zero or negative as well.",
|
||||
)
|
||||
@click.option(
|
||||
"-I",
|
||||
"--iterations",
|
||||
default=0,
|
||||
help="Number of times for erosion/dilation will be applied.",
|
||||
)
|
||||
@click.option(
|
||||
"-res",
|
||||
"--resolution",
|
||||
default=300,
|
||||
help="Resolution used for PDF to PNG conversion.",
|
||||
)
|
||||
@click.option(
|
||||
"-plot",
|
||||
"--plot_type",
|
||||
type=click.Choice(["text", "grid", "contour", "joint", "line"]),
|
||||
help="Plot elements found on PDF page for visual debugging.",
|
||||
)
|
||||
@click.argument("filepath", type=click.Path(exists=True))
|
||||
@pass_config
|
||||
def lattice(c, *args, **kwargs):
|
||||
"""Use lines between text to parse the table."""
|
||||
conf = c.config
|
||||
pages = conf.pop('pages')
|
||||
output = conf.pop('output')
|
||||
f = conf.pop('format')
|
||||
compress = conf.pop('zip')
|
||||
quiet = conf.pop('quiet')
|
||||
plot_type = kwargs.pop('plot_type')
|
||||
filepath = kwargs.pop('filepath')
|
||||
pages = conf.pop("pages")
|
||||
output = conf.pop("output")
|
||||
f = conf.pop("format")
|
||||
compress = conf.pop("zip")
|
||||
quiet = conf.pop("quiet")
|
||||
plot_type = kwargs.pop("plot_type")
|
||||
filepath = kwargs.pop("filepath")
|
||||
kwargs.update(conf)
|
||||
|
||||
table_regions = list(kwargs['table_regions'])
|
||||
kwargs['table_regions'] = None if not table_regions else table_regions
|
||||
table_areas = list(kwargs['table_areas'])
|
||||
kwargs['table_areas'] = None if not table_areas else table_areas
|
||||
copy_text = list(kwargs['copy_text'])
|
||||
kwargs['copy_text'] = None if not copy_text else copy_text
|
||||
kwargs['shift_text'] = list(kwargs['shift_text'])
|
||||
table_regions = list(kwargs["table_regions"])
|
||||
kwargs["table_regions"] = None if not table_regions else table_regions
|
||||
table_areas = list(kwargs["table_areas"])
|
||||
kwargs["table_areas"] = None if not table_areas else table_areas
|
||||
copy_text = list(kwargs["copy_text"])
|
||||
kwargs["copy_text"] = None if not copy_text else copy_text
|
||||
kwargs["shift_text"] = list(kwargs["shift_text"])
|
||||
|
||||
if plot_type is not None:
|
||||
if not _HAS_MPL:
|
||||
raise ImportError('matplotlib is required for plotting.')
|
||||
raise ImportError("matplotlib is required for plotting.")
|
||||
else:
|
||||
if output is None:
|
||||
raise click.UsageError('Please specify output file path using --output')
|
||||
raise click.UsageError("Please specify output file path using --output")
|
||||
if f is None:
|
||||
raise click.UsageError('Please specify output file format using --format')
|
||||
raise click.UsageError("Please specify output file format using --format")
|
||||
|
||||
tables = read_pdf(filepath, pages=pages, flavor='lattice',
|
||||
suppress_stdout=quiet, **kwargs)
|
||||
click.echo('Found {} tables'.format(tables.n))
|
||||
tables = read_pdf(
|
||||
filepath, pages=pages, flavor="lattice", suppress_stdout=quiet, **kwargs
|
||||
)
|
||||
click.echo("Found {} tables".format(tables.n))
|
||||
if plot_type is not None:
|
||||
for table in tables:
|
||||
plot(table, kind=plot_type)
|
||||
|
|
@ -136,57 +213,89 @@ def lattice(c, *args, **kwargs):
|
|||
tables.export(output, f=f, compress=compress)
|
||||
|
||||
|
||||
@cli.command('stream')
|
||||
@click.option('-R', '--table_regions', default=[], multiple=True,
|
||||
help='Page regions to analyze. Example: x1,y1,x2,y2'
|
||||
' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
|
||||
@click.option('-T', '--table_areas', default=[], multiple=True,
|
||||
help='Table areas to process. Example: x1,y1,x2,y2'
|
||||
' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
|
||||
@click.option('-C', '--columns', default=[], multiple=True,
|
||||
help='X coordinates of column separators.')
|
||||
@click.option('-e', '--edge_tol', default=50, help='Tolerance parameter'
|
||||
' for extending textedges vertically.')
|
||||
@click.option('-r', '--row_tol', default=2, help='Tolerance parameter'
|
||||
' used to combine text vertically, to generate rows.')
|
||||
@click.option('-c', '--column_tol', default=0, help='Tolerance parameter'
|
||||
' used to combine text horizontally, to generate columns.')
|
||||
@click.option('-plot', '--plot_type',
|
||||
type=click.Choice(['text', 'grid', 'contour', 'textedge']),
|
||||
help='Plot elements found on PDF page for visual debugging.')
|
||||
@click.argument('filepath', type=click.Path(exists=True))
|
||||
@cli.command("stream")
|
||||
@click.option(
|
||||
"-R",
|
||||
"--table_regions",
|
||||
default=[],
|
||||
multiple=True,
|
||||
help="Page regions to analyze. Example: x1,y1,x2,y2"
|
||||
" where x1, y1 -> left-top and x2, y2 -> right-bottom.",
|
||||
)
|
||||
@click.option(
|
||||
"-T",
|
||||
"--table_areas",
|
||||
default=[],
|
||||
multiple=True,
|
||||
help="Table areas to process. Example: x1,y1,x2,y2"
|
||||
" where x1, y1 -> left-top and x2, y2 -> right-bottom.",
|
||||
)
|
||||
@click.option(
|
||||
"-C",
|
||||
"--columns",
|
||||
default=[],
|
||||
multiple=True,
|
||||
help="X coordinates of column separators.",
|
||||
)
|
||||
@click.option(
|
||||
"-e",
|
||||
"--edge_tol",
|
||||
default=50,
|
||||
help="Tolerance parameter" " for extending textedges vertically.",
|
||||
)
|
||||
@click.option(
|
||||
"-r",
|
||||
"--row_tol",
|
||||
default=2,
|
||||
help="Tolerance parameter" " used to combine text vertically, to generate rows.",
|
||||
)
|
||||
@click.option(
|
||||
"-c",
|
||||
"--column_tol",
|
||||
default=0,
|
||||
help="Tolerance parameter"
|
||||
" used to combine text horizontally, to generate columns.",
|
||||
)
|
||||
@click.option(
|
||||
"-plot",
|
||||
"--plot_type",
|
||||
type=click.Choice(["text", "grid", "contour", "textedge"]),
|
||||
help="Plot elements found on PDF page for visual debugging.",
|
||||
)
|
||||
@click.argument("filepath", type=click.Path(exists=True))
|
||||
@pass_config
|
||||
def stream(c, *args, **kwargs):
|
||||
"""Use spaces between text to parse the table."""
|
||||
conf = c.config
|
||||
pages = conf.pop('pages')
|
||||
output = conf.pop('output')
|
||||
f = conf.pop('format')
|
||||
compress = conf.pop('zip')
|
||||
quiet = conf.pop('quiet')
|
||||
plot_type = kwargs.pop('plot_type')
|
||||
filepath = kwargs.pop('filepath')
|
||||
pages = conf.pop("pages")
|
||||
output = conf.pop("output")
|
||||
f = conf.pop("format")
|
||||
compress = conf.pop("zip")
|
||||
quiet = conf.pop("quiet")
|
||||
plot_type = kwargs.pop("plot_type")
|
||||
filepath = kwargs.pop("filepath")
|
||||
kwargs.update(conf)
|
||||
|
||||
table_regions = list(kwargs['table_regions'])
|
||||
kwargs['table_regions'] = None if not table_regions else table_regions
|
||||
table_areas = list(kwargs['table_areas'])
|
||||
kwargs['table_areas'] = None if not table_areas else table_areas
|
||||
columns = list(kwargs['columns'])
|
||||
kwargs['columns'] = None if not columns else columns
|
||||
table_regions = list(kwargs["table_regions"])
|
||||
kwargs["table_regions"] = None if not table_regions else table_regions
|
||||
table_areas = list(kwargs["table_areas"])
|
||||
kwargs["table_areas"] = None if not table_areas else table_areas
|
||||
columns = list(kwargs["columns"])
|
||||
kwargs["columns"] = None if not columns else columns
|
||||
|
||||
if plot_type is not None:
|
||||
if not _HAS_MPL:
|
||||
raise ImportError('matplotlib is required for plotting.')
|
||||
raise ImportError("matplotlib is required for plotting.")
|
||||
else:
|
||||
if output is None:
|
||||
raise click.UsageError('Please specify output file path using --output')
|
||||
raise click.UsageError("Please specify output file path using --output")
|
||||
if f is None:
|
||||
raise click.UsageError('Please specify output file format using --format')
|
||||
raise click.UsageError("Please specify output file format using --format")
|
||||
|
||||
tables = read_pdf(filepath, pages=pages, flavor='stream',
|
||||
suppress_stdout=quiet, **kwargs)
|
||||
click.echo('Found {} tables'.format(tables.n))
|
||||
tables = read_pdf(
|
||||
filepath, pages=pages, flavor="stream", suppress_stdout=quiet, **kwargs
|
||||
)
|
||||
click.echo("Found {} tables".format(tables.n))
|
||||
if plot_type is not None:
|
||||
for table in tables:
|
||||
plot(table, kind=plot_type)
|
||||
|
|
|
|||
201
camelot/core.py
201
camelot/core.py
|
|
@ -42,7 +42,8 @@ class TextEdge(object):
|
|||
TEXTEDGE_REQUIRED_ELEMENTS horizontal text rows.
|
||||
|
||||
"""
|
||||
def __init__(self, x, y0, y1, align='left'):
|
||||
|
||||
def __init__(self, x, y0, y1, align="left"):
|
||||
self.x = x
|
||||
self.y0 = y0
|
||||
self.y1 = y1
|
||||
|
|
@ -51,8 +52,13 @@ class TextEdge(object):
|
|||
self.is_valid = False
|
||||
|
||||
def __repr__(self):
|
||||
return '<TextEdge x={} y0={} y1={} align={} valid={}>'.format(
|
||||
round(self.x, 2), round(self.y0, 2), round(self.y1, 2), self.align, self.is_valid)
|
||||
return "<TextEdge x={} y0={} y1={} align={} valid={}>".format(
|
||||
round(self.x, 2),
|
||||
round(self.y0, 2),
|
||||
round(self.y1, 2),
|
||||
self.align,
|
||||
self.is_valid,
|
||||
)
|
||||
|
||||
def update_coords(self, x, y0, edge_tol=50):
|
||||
"""Updates the text edge's x and bottom y coordinates and sets
|
||||
|
|
@ -73,9 +79,10 @@ class TextEdges(object):
|
|||
the PDF page. The dict has three keys based on the alignments,
|
||||
and each key's value is a list of camelot.core.TextEdge objects.
|
||||
"""
|
||||
|
||||
def __init__(self, edge_tol=50):
|
||||
self.edge_tol = edge_tol
|
||||
self._textedges = {'left': [], 'right': [], 'middle': []}
|
||||
self._textedges = {"left": [], "right": [], "middle": []}
|
||||
|
||||
@staticmethod
|
||||
def get_x_coord(textline, align):
|
||||
|
|
@ -85,7 +92,7 @@ class TextEdges(object):
|
|||
x_left = textline.x0
|
||||
x_right = textline.x1
|
||||
x_middle = x_left + (x_right - x_left) / 2.0
|
||||
x_coord = {'left': x_left, 'middle': x_middle, 'right': x_right}
|
||||
x_coord = {"left": x_left, "middle": x_middle, "right": x_right}
|
||||
return x_coord[align]
|
||||
|
||||
def find(self, x_coord, align):
|
||||
|
|
@ -109,21 +116,22 @@ class TextEdges(object):
|
|||
def update(self, textline):
|
||||
"""Updates an existing text edge in the current dict.
|
||||
"""
|
||||
for align in ['left', 'right', 'middle']:
|
||||
for align in ["left", "right", "middle"]:
|
||||
x_coord = self.get_x_coord(textline, align)
|
||||
idx = self.find(x_coord, align)
|
||||
if idx is None:
|
||||
self.add(textline, align)
|
||||
else:
|
||||
self._textedges[align][idx].update_coords(
|
||||
x_coord, textline.y0, edge_tol=self.edge_tol)
|
||||
x_coord, textline.y0, edge_tol=self.edge_tol
|
||||
)
|
||||
|
||||
def generate(self, textlines):
|
||||
"""Generates the text edges dict based on horizontal text
|
||||
rows.
|
||||
"""
|
||||
for tl in textlines:
|
||||
if len(tl.get_text().strip()) > 1: # TODO: hacky
|
||||
if len(tl.get_text().strip()) > 1: # TODO: hacky
|
||||
self.update(tl)
|
||||
|
||||
def get_relevant(self):
|
||||
|
|
@ -132,9 +140,15 @@ class TextEdges(object):
|
|||
the most.
|
||||
"""
|
||||
intersections_sum = {
|
||||
'left': sum(te.intersections for te in self._textedges['left'] if te.is_valid),
|
||||
'right': sum(te.intersections for te in self._textedges['right'] if te.is_valid),
|
||||
'middle': sum(te.intersections for te in self._textedges['middle'] if te.is_valid)
|
||||
"left": sum(
|
||||
te.intersections for te in self._textedges["left"] if te.is_valid
|
||||
),
|
||||
"right": sum(
|
||||
te.intersections for te in self._textedges["right"] if te.is_valid
|
||||
),
|
||||
"middle": sum(
|
||||
te.intersections for te in self._textedges["middle"] if te.is_valid
|
||||
),
|
||||
}
|
||||
|
||||
# TODO: naive
|
||||
|
|
@ -147,6 +161,7 @@ class TextEdges(object):
|
|||
"""Returns a dict of interesting table areas on the PDF page
|
||||
calculated using relevant text edges.
|
||||
"""
|
||||
|
||||
def pad(area, average_row_height):
|
||||
x0 = area[0] - TABLE_AREA_PADDING
|
||||
y0 = area[1] - TABLE_AREA_PADDING
|
||||
|
|
@ -175,7 +190,11 @@ class TextEdges(object):
|
|||
else:
|
||||
table_areas.pop(found)
|
||||
updated_area = (
|
||||
found[0], min(te.y0, found[1]), max(found[2], te.x), max(found[3], te.y1))
|
||||
found[0],
|
||||
min(te.y0, found[1]),
|
||||
max(found[2], te.x),
|
||||
max(found[3], te.y1),
|
||||
)
|
||||
table_areas[updated_area] = None
|
||||
|
||||
# extend table areas based on textlines that overlap
|
||||
|
|
@ -196,7 +215,11 @@ class TextEdges(object):
|
|||
if found is not None:
|
||||
table_areas.pop(found)
|
||||
updated_area = (
|
||||
min(tl.x0, found[0]), min(tl.y0, found[1]), max(found[2], tl.x1), max(found[3], tl.y1))
|
||||
min(tl.x0, found[0]),
|
||||
min(tl.y0, found[1]),
|
||||
max(found[2], tl.x1),
|
||||
max(found[3], tl.y1),
|
||||
)
|
||||
table_areas[updated_area] = None
|
||||
average_textline_height = sum_textline_height / float(len(textlines))
|
||||
|
||||
|
|
@ -265,11 +288,12 @@ class Cell(object):
|
|||
self.bottom = False
|
||||
self.hspan = False
|
||||
self.vspan = False
|
||||
self._text = ''
|
||||
self._text = ""
|
||||
|
||||
def __repr__(self):
|
||||
return '<Cell x1={} y1={} x2={} y2={}>'.format(
|
||||
round(self.x1, 2), round(self.y1, 2), round(self.x2, 2), round(self.y2, 2))
|
||||
return "<Cell x1={} y1={} x2={} y2={}>".format(
|
||||
round(self.x1, 2), round(self.y1, 2), round(self.x2, 2), round(self.y2, 2)
|
||||
)
|
||||
|
||||
@property
|
||||
def text(self):
|
||||
|
|
@ -277,7 +301,7 @@ class Cell(object):
|
|||
|
||||
@text.setter
|
||||
def text(self, t):
|
||||
self._text = ''.join([self._text, t])
|
||||
self._text = "".join([self._text, t])
|
||||
|
||||
@property
|
||||
def bound(self):
|
||||
|
|
@ -314,11 +338,11 @@ class Table(object):
|
|||
PDF page number.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, cols, rows):
|
||||
self.cols = cols
|
||||
self.rows = rows
|
||||
self.cells = [[Cell(c[0], r[1], c[1], r[0])
|
||||
for c in cols] for r in rows]
|
||||
self.cells = [[Cell(c[0], r[1], c[1], r[0]) for c in cols] for r in rows]
|
||||
self.df = None
|
||||
self.shape = (0, 0)
|
||||
self.accuracy = 0
|
||||
|
|
@ -327,7 +351,7 @@ class Table(object):
|
|||
self.page = None
|
||||
|
||||
def __repr__(self):
|
||||
return '<{} shape={}>'.format(self.__class__.__name__, self.shape)
|
||||
return "<{} shape={}>".format(self.__class__.__name__, self.shape)
|
||||
|
||||
def __lt__(self, other):
|
||||
if self.page == other.page:
|
||||
|
|
@ -352,10 +376,10 @@ class Table(object):
|
|||
"""
|
||||
# pretty?
|
||||
report = {
|
||||
'accuracy': round(self.accuracy, 2),
|
||||
'whitespace': round(self.whitespace, 2),
|
||||
'order': self.order,
|
||||
'page': self.page
|
||||
"accuracy": round(self.accuracy, 2),
|
||||
"whitespace": round(self.whitespace, 2),
|
||||
"order": self.order,
|
||||
"page": self.page,
|
||||
}
|
||||
return report
|
||||
|
||||
|
|
@ -383,12 +407,21 @@ class Table(object):
|
|||
for v in vertical:
|
||||
# find closest x coord
|
||||
# iterate over y coords and find closest start and end points
|
||||
i = [i for i, t in enumerate(self.cols)
|
||||
if np.isclose(v[0], t[0], atol=joint_tol)]
|
||||
j = [j for j, t in enumerate(self.rows)
|
||||
if np.isclose(v[3], t[0], atol=joint_tol)]
|
||||
k = [k for k, t in enumerate(self.rows)
|
||||
if np.isclose(v[1], t[0], atol=joint_tol)]
|
||||
i = [
|
||||
i
|
||||
for i, t in enumerate(self.cols)
|
||||
if np.isclose(v[0], t[0], atol=joint_tol)
|
||||
]
|
||||
j = [
|
||||
j
|
||||
for j, t in enumerate(self.rows)
|
||||
if np.isclose(v[3], t[0], atol=joint_tol)
|
||||
]
|
||||
k = [
|
||||
k
|
||||
for k, t in enumerate(self.rows)
|
||||
if np.isclose(v[1], t[0], atol=joint_tol)
|
||||
]
|
||||
if not j:
|
||||
continue
|
||||
J = j[0]
|
||||
|
|
@ -434,12 +467,21 @@ class Table(object):
|
|||
for h in horizontal:
|
||||
# find closest y coord
|
||||
# iterate over x coords and find closest start and end points
|
||||
i = [i for i, t in enumerate(self.rows)
|
||||
if np.isclose(h[1], t[0], atol=joint_tol)]
|
||||
j = [j for j, t in enumerate(self.cols)
|
||||
if np.isclose(h[0], t[0], atol=joint_tol)]
|
||||
k = [k for k, t in enumerate(self.cols)
|
||||
if np.isclose(h[2], t[0], atol=joint_tol)]
|
||||
i = [
|
||||
i
|
||||
for i, t in enumerate(self.rows)
|
||||
if np.isclose(h[1], t[0], atol=joint_tol)
|
||||
]
|
||||
j = [
|
||||
j
|
||||
for j, t in enumerate(self.cols)
|
||||
if np.isclose(h[0], t[0], atol=joint_tol)
|
||||
]
|
||||
k = [
|
||||
k
|
||||
for k, t in enumerate(self.cols)
|
||||
if np.isclose(h[2], t[0], atol=joint_tol)
|
||||
]
|
||||
if not j:
|
||||
continue
|
||||
J = j[0]
|
||||
|
|
@ -537,12 +579,7 @@ class Table(object):
|
|||
Output filepath.
|
||||
|
||||
"""
|
||||
kw = {
|
||||
'encoding': 'utf-8',
|
||||
'index': False,
|
||||
'header': False,
|
||||
'quoting': 1
|
||||
}
|
||||
kw = {"encoding": "utf-8", "index": False, "header": False, "quoting": 1}
|
||||
kw.update(kwargs)
|
||||
self.df.to_csv(path, **kw)
|
||||
|
||||
|
|
@ -557,12 +594,10 @@ class Table(object):
|
|||
Output filepath.
|
||||
|
||||
"""
|
||||
kw = {
|
||||
'orient': 'records'
|
||||
}
|
||||
kw = {"orient": "records"}
|
||||
kw.update(kwargs)
|
||||
json_string = self.df.to_json(**kw)
|
||||
with open(path, 'w') as f:
|
||||
with open(path, "w") as f:
|
||||
f.write(json_string)
|
||||
|
||||
def to_excel(self, path, **kwargs):
|
||||
|
|
@ -577,8 +612,8 @@ class Table(object):
|
|||
|
||||
"""
|
||||
kw = {
|
||||
'sheet_name': 'page-{}-table-{}'.format(self.page, self.order),
|
||||
'encoding': 'utf-8'
|
||||
"sheet_name": "page-{}-table-{}".format(self.page, self.order),
|
||||
"encoding": "utf-8",
|
||||
}
|
||||
kw.update(kwargs)
|
||||
writer = pd.ExcelWriter(path)
|
||||
|
|
@ -597,7 +632,7 @@ class Table(object):
|
|||
|
||||
"""
|
||||
html_string = self.df.to_html(**kwargs)
|
||||
with open(path, 'w') as f:
|
||||
with open(path, "w") as f:
|
||||
f.write(html_string)
|
||||
|
||||
def to_sqlite(self, path, **kwargs):
|
||||
|
|
@ -611,13 +646,10 @@ class Table(object):
|
|||
Output filepath.
|
||||
|
||||
"""
|
||||
kw = {
|
||||
'if_exists': 'replace',
|
||||
'index': False
|
||||
}
|
||||
kw = {"if_exists": "replace", "index": False}
|
||||
kw.update(kwargs)
|
||||
conn = sqlite3.connect(path)
|
||||
table_name = 'page-{}-table-{}'.format(self.page, self.order)
|
||||
table_name = "page-{}-table-{}".format(self.page, self.order)
|
||||
self.df.to_sql(table_name, conn, **kw)
|
||||
conn.commit()
|
||||
conn.close()
|
||||
|
|
@ -633,12 +665,12 @@ class TableList(object):
|
|||
Number of tables in the list.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, tables):
|
||||
self._tables = tables
|
||||
|
||||
def __repr__(self):
|
||||
return '<{} n={}>'.format(
|
||||
self.__class__.__name__, self.n)
|
||||
return "<{} n={}>".format(self.__class__.__name__, self.n)
|
||||
|
||||
def __len__(self):
|
||||
return len(self._tables)
|
||||
|
|
@ -648,37 +680,39 @@ class TableList(object):
|
|||
|
||||
@staticmethod
|
||||
def _format_func(table, f):
|
||||
return getattr(table, 'to_{}'.format(f))
|
||||
return getattr(table, "to_{}".format(f))
|
||||
|
||||
@property
|
||||
def n(self):
|
||||
return len(self)
|
||||
|
||||
def _write_file(self, f=None, **kwargs):
|
||||
dirname = kwargs.get('dirname')
|
||||
root = kwargs.get('root')
|
||||
ext = kwargs.get('ext')
|
||||
dirname = kwargs.get("dirname")
|
||||
root = kwargs.get("root")
|
||||
ext = kwargs.get("ext")
|
||||
for table in self._tables:
|
||||
filename = os.path.join('{}-page-{}-table-{}{}'.format(
|
||||
root, table.page, table.order, ext))
|
||||
filename = os.path.join(
|
||||
"{}-page-{}-table-{}{}".format(root, table.page, table.order, ext)
|
||||
)
|
||||
filepath = os.path.join(dirname, filename)
|
||||
to_format = self._format_func(table, f)
|
||||
to_format(filepath)
|
||||
|
||||
def _compress_dir(self, **kwargs):
|
||||
path = kwargs.get('path')
|
||||
dirname = kwargs.get('dirname')
|
||||
root = kwargs.get('root')
|
||||
ext = kwargs.get('ext')
|
||||
zipname = os.path.join(os.path.dirname(path), root) + '.zip'
|
||||
with zipfile.ZipFile(zipname, 'w', allowZip64=True) as z:
|
||||
path = kwargs.get("path")
|
||||
dirname = kwargs.get("dirname")
|
||||
root = kwargs.get("root")
|
||||
ext = kwargs.get("ext")
|
||||
zipname = os.path.join(os.path.dirname(path), root) + ".zip"
|
||||
with zipfile.ZipFile(zipname, "w", allowZip64=True) as z:
|
||||
for table in self._tables:
|
||||
filename = os.path.join('{}-page-{}-table-{}{}'.format(
|
||||
root, table.page, table.order, ext))
|
||||
filename = os.path.join(
|
||||
"{}-page-{}-table-{}{}".format(root, table.page, table.order, ext)
|
||||
)
|
||||
filepath = os.path.join(dirname, filename)
|
||||
z.write(filepath, os.path.basename(filepath))
|
||||
|
||||
def export(self, path, f='csv', compress=False):
|
||||
def export(self, path, f="csv", compress=False):
|
||||
"""Exports the list of tables to specified file format.
|
||||
|
||||
Parameters
|
||||
|
|
@ -697,33 +731,28 @@ class TableList(object):
|
|||
if compress:
|
||||
dirname = tempfile.mkdtemp()
|
||||
|
||||
kwargs = {
|
||||
'path': path,
|
||||
'dirname': dirname,
|
||||
'root': root,
|
||||
'ext': ext
|
||||
}
|
||||
kwargs = {"path": path, "dirname": dirname, "root": root, "ext": ext}
|
||||
|
||||
if f in ['csv', 'json', 'html']:
|
||||
if f in ["csv", "json", "html"]:
|
||||
self._write_file(f=f, **kwargs)
|
||||
if compress:
|
||||
self._compress_dir(**kwargs)
|
||||
elif f == 'excel':
|
||||
elif f == "excel":
|
||||
filepath = os.path.join(dirname, basename)
|
||||
writer = pd.ExcelWriter(filepath)
|
||||
for table in self._tables:
|
||||
sheet_name = 'page-{}-table-{}'.format(table.page, table.order)
|
||||
table.df.to_excel(writer, sheet_name=sheet_name, encoding='utf-8')
|
||||
sheet_name = "page-{}-table-{}".format(table.page, table.order)
|
||||
table.df.to_excel(writer, sheet_name=sheet_name, encoding="utf-8")
|
||||
writer.save()
|
||||
if compress:
|
||||
zipname = os.path.join(os.path.dirname(path), root) + '.zip'
|
||||
with zipfile.ZipFile(zipname, 'w', allowZip64=True) as z:
|
||||
zipname = os.path.join(os.path.dirname(path), root) + ".zip"
|
||||
with zipfile.ZipFile(zipname, "w", allowZip64=True) as z:
|
||||
z.write(filepath, os.path.basename(filepath))
|
||||
elif f == 'sqlite':
|
||||
elif f == "sqlite":
|
||||
filepath = os.path.join(dirname, basename)
|
||||
for table in self._tables:
|
||||
table.to_sqlite(filepath)
|
||||
if compress:
|
||||
zipname = os.path.join(os.path.dirname(path), root) + '.zip'
|
||||
with zipfile.ZipFile(zipname, 'w', allowZip64=True) as z:
|
||||
zipname = os.path.join(os.path.dirname(path), root) + ".zip"
|
||||
with zipfile.ZipFile(zipname, "w", allowZip64=True) as z:
|
||||
z.write(filepath, os.path.basename(filepath))
|
||||
|
|
|
|||
|
|
@ -24,10 +24,10 @@ ghostscript - A Python interface for the Ghostscript interpreter C-API
|
|||
from . import _gsprint as gs
|
||||
|
||||
|
||||
__author__ = 'Hartmut Goebel <h.goebel@crazy-compilers.com>'
|
||||
__copyright__ = 'Copyright 2010-2018 by Hartmut Goebel <h.goebel@crazy-compilers.com>'
|
||||
__license__ = 'GNU General Public License version 3 (GPL v3)'
|
||||
__version__ = '0.6'
|
||||
__author__ = "Hartmut Goebel <h.goebel@crazy-compilers.com>"
|
||||
__copyright__ = "Copyright 2010-2018 by Hartmut Goebel <h.goebel@crazy-compilers.com>"
|
||||
__license__ = "GNU General Public License version 3 (GPL v3)"
|
||||
__version__ = "0.6"
|
||||
|
||||
|
||||
class __Ghostscript(object):
|
||||
|
|
@ -87,10 +87,13 @@ def Ghostscript(*args, **kwargs):
|
|||
# Ghostscript only supports a single instance
|
||||
if __instance__ is None:
|
||||
__instance__ = gs.new_instance()
|
||||
return __Ghostscript(__instance__, args,
|
||||
stdin=kwargs.get('stdin', None),
|
||||
stdout=kwargs.get('stdout', None),
|
||||
stderr=kwargs.get('stderr', None))
|
||||
return __Ghostscript(
|
||||
__instance__,
|
||||
args,
|
||||
stdin=kwargs.get("stdin", None),
|
||||
stdout=kwargs.get("stdout", None),
|
||||
stderr=kwargs.get("stderr", None),
|
||||
)
|
||||
|
||||
|
||||
__instance__ = None
|
||||
|
|
|
|||
|
|
@ -42,10 +42,10 @@ e_Info = -110
|
|||
#
|
||||
e_Quit = -101
|
||||
|
||||
__author__ = 'Hartmut Goebel <h.goebel@crazy-compilers.com>'
|
||||
__copyright__ = 'Copyright 2010-2018 by Hartmut Goebel <h.goebel@crazy-compilers.com>'
|
||||
__license__ = 'GNU General Public License version 3 (GPL v3)'
|
||||
__version__ = '0.6'
|
||||
__author__ = "Hartmut Goebel <h.goebel@crazy-compilers.com>"
|
||||
__copyright__ = "Copyright 2010-2018 by Hartmut Goebel <h.goebel@crazy-compilers.com>"
|
||||
__license__ = "GNU General Public License version 3 (GPL v3)"
|
||||
__version__ = "0.6"
|
||||
|
||||
gs_main_instance = c_void_p
|
||||
display_callback = c_void_p
|
||||
|
|
@ -55,7 +55,7 @@ display_callback = c_void_p
|
|||
|
||||
class GhostscriptError(Exception):
|
||||
def __init__(self, ecode):
|
||||
self.code = ecode
|
||||
self.code = ecode
|
||||
|
||||
|
||||
def new_instance():
|
||||
|
|
@ -89,6 +89,7 @@ def _wrap_stdin(infp):
|
|||
"""Wrap a filehandle into a C function to be used as `stdin` callback
|
||||
for ``set_stdio``. The filehandle has to support the readline() method.
|
||||
"""
|
||||
|
||||
def _wrap(instance, dest, count):
|
||||
try:
|
||||
data = infp.readline(count)
|
||||
|
|
@ -110,6 +111,7 @@ def _wrap_stdout(outfp):
|
|||
`stderr` callback for ``set_stdio``. The filehandle has to support the
|
||||
write() and flush() methods.
|
||||
"""
|
||||
|
||||
def _wrap(instance, str, count):
|
||||
outfp.write(str[:count])
|
||||
outfp.flush()
|
||||
|
|
@ -187,11 +189,23 @@ def __win32_finddll():
|
|||
import winreg
|
||||
except ImportError:
|
||||
# assume Python 2
|
||||
from _winreg import OpenKey, CloseKey, EnumKey, QueryValueEx, \
|
||||
QueryInfoKey, HKEY_LOCAL_MACHINE
|
||||
from _winreg import (
|
||||
OpenKey,
|
||||
CloseKey,
|
||||
EnumKey,
|
||||
QueryValueEx,
|
||||
QueryInfoKey,
|
||||
HKEY_LOCAL_MACHINE,
|
||||
)
|
||||
else:
|
||||
from winreg import OpenKey, CloseKey, EnumKey, QueryValueEx, \
|
||||
QueryInfoKey, HKEY_LOCAL_MACHINE
|
||||
from winreg import (
|
||||
OpenKey,
|
||||
CloseKey,
|
||||
EnumKey,
|
||||
QueryValueEx,
|
||||
QueryInfoKey,
|
||||
HKEY_LOCAL_MACHINE,
|
||||
)
|
||||
|
||||
from distutils.version import LooseVersion
|
||||
import os
|
||||
|
|
@ -199,15 +213,19 @@ def __win32_finddll():
|
|||
dlls = []
|
||||
# Look up different variants of Ghostscript and take the highest
|
||||
# version for which the DLL is to be found in the filesystem.
|
||||
for key_name in ('AFPL Ghostscript', 'Aladdin Ghostscript',
|
||||
'GNU Ghostscript', 'GPL Ghostscript'):
|
||||
for key_name in (
|
||||
"AFPL Ghostscript",
|
||||
"Aladdin Ghostscript",
|
||||
"GNU Ghostscript",
|
||||
"GPL Ghostscript",
|
||||
):
|
||||
try:
|
||||
k1 = OpenKey(HKEY_LOCAL_MACHINE, "Software\\%s" % key_name)
|
||||
for num in range(0, QueryInfoKey(k1)[0]):
|
||||
version = EnumKey(k1, num)
|
||||
try:
|
||||
k2 = OpenKey(k1, version)
|
||||
dll_path = QueryValueEx(k2, 'GS_DLL')[0]
|
||||
dll_path = QueryValueEx(k2, "GS_DLL")[0]
|
||||
CloseKey(k2)
|
||||
if os.path.exists(dll_path):
|
||||
dlls.append((LooseVersion(version), dll_path))
|
||||
|
|
@ -223,21 +241,21 @@ def __win32_finddll():
|
|||
return None
|
||||
|
||||
|
||||
if sys.platform == 'win32':
|
||||
if sys.platform == "win32":
|
||||
libgs = __win32_finddll()
|
||||
if not libgs:
|
||||
raise RuntimeError('Please make sure that Ghostscript is installed')
|
||||
raise RuntimeError("Please make sure that Ghostscript is installed")
|
||||
libgs = windll.LoadLibrary(libgs)
|
||||
else:
|
||||
try:
|
||||
libgs = cdll.LoadLibrary('libgs.so')
|
||||
libgs = cdll.LoadLibrary("libgs.so")
|
||||
except OSError:
|
||||
# shared object file not found
|
||||
import ctypes.util
|
||||
|
||||
libgs = ctypes.util.find_library('gs')
|
||||
libgs = ctypes.util.find_library("gs")
|
||||
if not libgs:
|
||||
raise RuntimeError('Please make sure that Ghostscript is installed')
|
||||
raise RuntimeError("Please make sure that Ghostscript is installed")
|
||||
libgs = cdll.LoadLibrary(libgs)
|
||||
|
||||
del __win32_finddll
|
||||
|
|
|
|||
|
|
@ -7,8 +7,14 @@ from PyPDF2 import PdfFileReader, PdfFileWriter
|
|||
|
||||
from .core import TableList
|
||||
from .parsers import Stream, Lattice
|
||||
from .utils import (TemporaryDirectory, get_page_layout, get_text_objects,
|
||||
get_rotation, is_url, download_url)
|
||||
from .utils import (
|
||||
TemporaryDirectory,
|
||||
get_page_layout,
|
||||
get_text_objects,
|
||||
get_rotation,
|
||||
is_url,
|
||||
download_url,
|
||||
)
|
||||
|
||||
|
||||
class PDFHandler(object):
|
||||
|
|
@ -27,19 +33,20 @@ class PDFHandler(object):
|
|||
Password for decryption.
|
||||
|
||||
"""
|
||||
def __init__(self, filepath, pages='1', password=None):
|
||||
|
||||
def __init__(self, filepath, pages="1", password=None):
|
||||
if is_url(filepath):
|
||||
filepath = download_url(filepath)
|
||||
self.filepath = filepath
|
||||
if not filepath.lower().endswith('.pdf'):
|
||||
if not filepath.lower().endswith(".pdf"):
|
||||
raise NotImplementedError("File format not supported")
|
||||
|
||||
if password is None:
|
||||
self.password = ''
|
||||
self.password = ""
|
||||
else:
|
||||
self.password = password
|
||||
if sys.version_info[0] < 3:
|
||||
self.password = self.password.encode('ascii')
|
||||
self.password = self.password.encode("ascii")
|
||||
self.pages = self._get_pages(self.filepath, pages)
|
||||
|
||||
def _get_pages(self, filepath, pages):
|
||||
|
|
@ -60,26 +67,26 @@ class PDFHandler(object):
|
|||
|
||||
"""
|
||||
page_numbers = []
|
||||
if pages == '1':
|
||||
page_numbers.append({'start': 1, 'end': 1})
|
||||
if pages == "1":
|
||||
page_numbers.append({"start": 1, "end": 1})
|
||||
else:
|
||||
infile = PdfFileReader(open(filepath, 'rb'), strict=False)
|
||||
infile = PdfFileReader(open(filepath, "rb"), strict=False)
|
||||
if infile.isEncrypted:
|
||||
infile.decrypt(self.password)
|
||||
if pages == 'all':
|
||||
page_numbers.append({'start': 1, 'end': infile.getNumPages()})
|
||||
if pages == "all":
|
||||
page_numbers.append({"start": 1, "end": infile.getNumPages()})
|
||||
else:
|
||||
for r in pages.split(','):
|
||||
if '-' in r:
|
||||
a, b = r.split('-')
|
||||
if b == 'end':
|
||||
for r in pages.split(","):
|
||||
if "-" in r:
|
||||
a, b = r.split("-")
|
||||
if b == "end":
|
||||
b = infile.getNumPages()
|
||||
page_numbers.append({'start': int(a), 'end': int(b)})
|
||||
page_numbers.append({"start": int(a), "end": int(b)})
|
||||
else:
|
||||
page_numbers.append({'start': int(r), 'end': int(r)})
|
||||
page_numbers.append({"start": int(r), "end": int(r)})
|
||||
P = []
|
||||
for p in page_numbers:
|
||||
P.extend(range(p['start'], p['end'] + 1))
|
||||
P.extend(range(p["start"], p["end"] + 1))
|
||||
return sorted(set(P))
|
||||
|
||||
def _save_page(self, filepath, page, temp):
|
||||
|
|
@ -95,16 +102,16 @@ class PDFHandler(object):
|
|||
Tmp directory.
|
||||
|
||||
"""
|
||||
with open(filepath, 'rb') as fileobj:
|
||||
with open(filepath, "rb") as fileobj:
|
||||
infile = PdfFileReader(fileobj, strict=False)
|
||||
if infile.isEncrypted:
|
||||
infile.decrypt(self.password)
|
||||
fpath = os.path.join(temp, 'page-{0}.pdf'.format(page))
|
||||
fpath = os.path.join(temp, "page-{0}.pdf".format(page))
|
||||
froot, fext = os.path.splitext(fpath)
|
||||
p = infile.getPage(page - 1)
|
||||
outfile = PdfFileWriter()
|
||||
outfile.addPage(p)
|
||||
with open(fpath, 'wb') as f:
|
||||
with open(fpath, "wb") as f:
|
||||
outfile.write(f)
|
||||
layout, dim = get_page_layout(fpath)
|
||||
# fix rotated PDF
|
||||
|
|
@ -112,23 +119,25 @@ class PDFHandler(object):
|
|||
horizontal_text = get_text_objects(layout, ltype="horizontal_text")
|
||||
vertical_text = get_text_objects(layout, ltype="vertical_text")
|
||||
rotation = get_rotation(chars, horizontal_text, vertical_text)
|
||||
if rotation != '':
|
||||
fpath_new = ''.join([froot.replace('page', 'p'), '_rotated', fext])
|
||||
if rotation != "":
|
||||
fpath_new = "".join([froot.replace("page", "p"), "_rotated", fext])
|
||||
os.rename(fpath, fpath_new)
|
||||
infile = PdfFileReader(open(fpath_new, 'rb'), strict=False)
|
||||
infile = PdfFileReader(open(fpath_new, "rb"), strict=False)
|
||||
if infile.isEncrypted:
|
||||
infile.decrypt(self.password)
|
||||
outfile = PdfFileWriter()
|
||||
p = infile.getPage(0)
|
||||
if rotation == 'anticlockwise':
|
||||
if rotation == "anticlockwise":
|
||||
p.rotateClockwise(90)
|
||||
elif rotation == 'clockwise':
|
||||
elif rotation == "clockwise":
|
||||
p.rotateCounterClockwise(90)
|
||||
outfile.addPage(p)
|
||||
with open(fpath, 'wb') as f:
|
||||
with open(fpath, "wb") as f:
|
||||
outfile.write(f)
|
||||
|
||||
def parse(self, flavor='lattice', suppress_stdout=False, layout_kwargs={}, **kwargs):
|
||||
def parse(
|
||||
self, flavor="lattice", suppress_stdout=False, layout_kwargs={}, **kwargs
|
||||
):
|
||||
"""Extracts tables by calling parser.get_tables on all single
|
||||
page PDFs.
|
||||
|
||||
|
|
@ -154,11 +163,13 @@ class PDFHandler(object):
|
|||
with TemporaryDirectory() as tempdir:
|
||||
for p in self.pages:
|
||||
self._save_page(self.filepath, p, tempdir)
|
||||
pages = [os.path.join(tempdir, 'page-{0}.pdf'.format(p))
|
||||
for p in self.pages]
|
||||
parser = Lattice(**kwargs) if flavor == 'lattice' else Stream(**kwargs)
|
||||
pages = [
|
||||
os.path.join(tempdir, "page-{0}.pdf".format(p)) for p in self.pages
|
||||
]
|
||||
parser = Lattice(**kwargs) if flavor == "lattice" else Stream(**kwargs)
|
||||
for p in pages:
|
||||
t = parser.extract_tables(p, suppress_stdout=suppress_stdout,
|
||||
layout_kwargs=layout_kwargs)
|
||||
t = parser.extract_tables(
|
||||
p, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs
|
||||
)
|
||||
tables.extend(t)
|
||||
return TableList(sorted(tables))
|
||||
|
|
|
|||
|
|
@ -39,17 +39,23 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
|
|||
|
||||
if process_background:
|
||||
threshold = cv2.adaptiveThreshold(
|
||||
gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
|
||||
cv2.THRESH_BINARY, blocksize, c)
|
||||
gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, blocksize, c
|
||||
)
|
||||
else:
|
||||
threshold = cv2.adaptiveThreshold(
|
||||
np.invert(gray), 255,
|
||||
cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, blocksize, c)
|
||||
np.invert(gray),
|
||||
255,
|
||||
cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
|
||||
cv2.THRESH_BINARY,
|
||||
blocksize,
|
||||
c,
|
||||
)
|
||||
return img, threshold
|
||||
|
||||
|
||||
def find_lines(threshold, regions=None, direction='horizontal',
|
||||
line_scale=15, iterations=0):
|
||||
def find_lines(
|
||||
threshold, regions=None, direction="horizontal", line_scale=15, iterations=0
|
||||
):
|
||||
"""Finds horizontal and vertical lines by applying morphological
|
||||
transformations on an image.
|
||||
|
||||
|
|
@ -87,15 +93,14 @@ def find_lines(threshold, regions=None, direction='horizontal',
|
|||
"""
|
||||
lines = []
|
||||
|
||||
if direction == 'vertical':
|
||||
if direction == "vertical":
|
||||
size = threshold.shape[0] // line_scale
|
||||
el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size))
|
||||
elif direction == 'horizontal':
|
||||
elif direction == "horizontal":
|
||||
size = threshold.shape[1] // line_scale
|
||||
el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1))
|
||||
elif direction is None:
|
||||
raise ValueError("Specify direction as either 'vertical' or"
|
||||
" 'horizontal'")
|
||||
raise ValueError("Specify direction as either 'vertical' or" " 'horizontal'")
|
||||
|
||||
if regions is not None:
|
||||
region_mask = np.zeros(threshold.shape)
|
||||
|
|
@ -110,19 +115,21 @@ def find_lines(threshold, regions=None, direction='horizontal',
|
|||
|
||||
try:
|
||||
_, contours, _ = cv2.findContours(
|
||||
threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
||||
threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
|
||||
)
|
||||
except ValueError:
|
||||
# for opencv backward compatibility
|
||||
contours, _ = cv2.findContours(
|
||||
threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
||||
threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
|
||||
)
|
||||
|
||||
for c in contours:
|
||||
x, y, w, h = cv2.boundingRect(c)
|
||||
x1, x2 = x, x + w
|
||||
y1, y2 = y, y + h
|
||||
if direction == 'vertical':
|
||||
if direction == "vertical":
|
||||
lines.append(((x1 + x2) // 2, y2, (x1 + x2) // 2, y1))
|
||||
elif direction == 'horizontal':
|
||||
elif direction == "horizontal":
|
||||
lines.append((x1, (y1 + y2) // 2, x2, (y1 + y2) // 2))
|
||||
|
||||
return dmask, lines
|
||||
|
|
@ -150,11 +157,13 @@ def find_contours(vertical, horizontal):
|
|||
|
||||
try:
|
||||
__, contours, __ = cv2.findContours(
|
||||
mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
||||
mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
|
||||
)
|
||||
except ValueError:
|
||||
# for opencv backward compatibility
|
||||
contours, __ = cv2.findContours(
|
||||
mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
||||
mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
|
||||
)
|
||||
# sort in reverse based on contour area and use first 10 contours
|
||||
contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10]
|
||||
|
||||
|
|
@ -196,11 +205,13 @@ def find_joints(contours, vertical, horizontal):
|
|||
roi = joints[y : y + h, x : x + w]
|
||||
try:
|
||||
__, jc, __ = cv2.findContours(
|
||||
roi.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
|
||||
roi.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE
|
||||
)
|
||||
except ValueError:
|
||||
# for opencv backward compatibility
|
||||
jc, __ = cv2.findContours(
|
||||
roi.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
|
||||
roi.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE
|
||||
)
|
||||
if len(jc) <= 4: # remove contours with less than 4 joints
|
||||
continue
|
||||
joint_coords = []
|
||||
|
|
|
|||
|
|
@ -6,8 +6,15 @@ from .handlers import PDFHandler
|
|||
from .utils import validate_input, remove_extra
|
||||
|
||||
|
||||
def read_pdf(filepath, pages='1', password=None, flavor='lattice',
|
||||
suppress_stdout=False, layout_kwargs={}, **kwargs):
|
||||
def read_pdf(
|
||||
filepath,
|
||||
pages="1",
|
||||
password=None,
|
||||
flavor="lattice",
|
||||
suppress_stdout=False,
|
||||
layout_kwargs={},
|
||||
**kwargs
|
||||
):
|
||||
"""Read PDF and return extracted tables.
|
||||
|
||||
Note: kwargs annotated with ^ can only be used with flavor='stream'
|
||||
|
|
@ -91,9 +98,10 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice',
|
|||
tables : camelot.core.TableList
|
||||
|
||||
"""
|
||||
if flavor not in ['lattice', 'stream']:
|
||||
raise NotImplementedError("Unknown flavor specified."
|
||||
" Use either 'lattice' or 'stream'")
|
||||
if flavor not in ["lattice", "stream"]:
|
||||
raise NotImplementedError(
|
||||
"Unknown flavor specified." " Use either 'lattice' or 'stream'"
|
||||
)
|
||||
|
||||
with warnings.catch_warnings():
|
||||
if suppress_stdout:
|
||||
|
|
@ -102,6 +110,10 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice',
|
|||
validate_input(kwargs, flavor=flavor)
|
||||
p = PDFHandler(filepath, pages=pages, password=password)
|
||||
kwargs = remove_extra(kwargs, flavor=flavor)
|
||||
tables = p.parse(flavor=flavor, suppress_stdout=suppress_stdout,
|
||||
layout_kwargs=layout_kwargs, **kwargs)
|
||||
tables = p.parse(
|
||||
flavor=flavor,
|
||||
suppress_stdout=suppress_stdout,
|
||||
layout_kwargs=layout_kwargs,
|
||||
**kwargs
|
||||
)
|
||||
return tables
|
||||
|
|
|
|||
|
|
@ -8,13 +8,13 @@ from ..utils import get_page_layout, get_text_objects
|
|||
class BaseParser(object):
|
||||
"""Defines a base parser.
|
||||
"""
|
||||
|
||||
def _generate_layout(self, filename, layout_kwargs):
|
||||
self.filename = filename
|
||||
self.layout_kwargs = layout_kwargs
|
||||
self.layout, self.dimensions = get_page_layout(
|
||||
filename, **layout_kwargs)
|
||||
self.images = get_text_objects(self.layout, ltype='image')
|
||||
self.horizontal_text = get_text_objects(self.layout, ltype='horizontal_text')
|
||||
self.vertical_text = get_text_objects(self.layout, ltype='vertical_text')
|
||||
self.layout, self.dimensions = get_page_layout(filename, **layout_kwargs)
|
||||
self.images = get_text_objects(self.layout, ltype="image")
|
||||
self.horizontal_text = get_text_objects(self.layout, ltype="horizontal_text")
|
||||
self.vertical_text = get_text_objects(self.layout, ltype="vertical_text")
|
||||
self.pdf_width, self.pdf_height = self.dimensions
|
||||
self.rootname, __ = os.path.splitext(self.filename)
|
||||
|
|
|
|||
|
|
@ -14,14 +14,25 @@ import pandas as pd
|
|||
|
||||
from .base import BaseParser
|
||||
from ..core import Table
|
||||
from ..utils import (scale_image, scale_pdf, segments_in_bbox, text_in_bbox,
|
||||
merge_close_lines, get_table_index, compute_accuracy,
|
||||
compute_whitespace)
|
||||
from ..image_processing import (adaptive_threshold, find_lines,
|
||||
find_contours, find_joints)
|
||||
from ..utils import (
|
||||
scale_image,
|
||||
scale_pdf,
|
||||
segments_in_bbox,
|
||||
text_in_bbox,
|
||||
merge_close_lines,
|
||||
get_table_index,
|
||||
compute_accuracy,
|
||||
compute_whitespace,
|
||||
)
|
||||
from ..image_processing import (
|
||||
adaptive_threshold,
|
||||
find_lines,
|
||||
find_contours,
|
||||
find_joints,
|
||||
)
|
||||
|
||||
|
||||
logger = logging.getLogger('camelot')
|
||||
logger = logging.getLogger("camelot")
|
||||
|
||||
|
||||
class Lattice(BaseParser):
|
||||
|
|
@ -83,11 +94,26 @@ class Lattice(BaseParser):
|
|||
Resolution used for PDF to PNG conversion.
|
||||
|
||||
"""
|
||||
def __init__(self, table_regions=None, table_areas=None, process_background=False,
|
||||
line_scale=15, copy_text=None, shift_text=['l', 't'],
|
||||
split_text=False, flag_size=False, strip_text='', line_tol=2,
|
||||
joint_tol=2, threshold_blocksize=15, threshold_constant=-2,
|
||||
iterations=0, resolution=300, **kwargs):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
table_regions=None,
|
||||
table_areas=None,
|
||||
process_background=False,
|
||||
line_scale=15,
|
||||
copy_text=None,
|
||||
shift_text=["l", "t"],
|
||||
split_text=False,
|
||||
flag_size=False,
|
||||
strip_text="",
|
||||
line_tol=2,
|
||||
joint_tol=2,
|
||||
threshold_blocksize=15,
|
||||
threshold_constant=-2,
|
||||
iterations=0,
|
||||
resolution=300,
|
||||
**kwargs
|
||||
):
|
||||
self.table_regions = table_regions
|
||||
self.table_areas = table_areas
|
||||
self.process_background = process_background
|
||||
|
|
@ -130,19 +156,19 @@ class Lattice(BaseParser):
|
|||
indices = []
|
||||
for r_idx, c_idx, text in idx:
|
||||
for d in shift_text:
|
||||
if d == 'l':
|
||||
if d == "l":
|
||||
if t.cells[r_idx][c_idx].hspan:
|
||||
while not t.cells[r_idx][c_idx].left:
|
||||
c_idx -= 1
|
||||
if d == 'r':
|
||||
if d == "r":
|
||||
if t.cells[r_idx][c_idx].hspan:
|
||||
while not t.cells[r_idx][c_idx].right:
|
||||
c_idx += 1
|
||||
if d == 't':
|
||||
if d == "t":
|
||||
if t.cells[r_idx][c_idx].vspan:
|
||||
while not t.cells[r_idx][c_idx].top:
|
||||
r_idx -= 1
|
||||
if d == 'b':
|
||||
if d == "b":
|
||||
if t.cells[r_idx][c_idx].vspan:
|
||||
while not t.cells[r_idx][c_idx].bottom:
|
||||
r_idx += 1
|
||||
|
|
@ -171,13 +197,13 @@ class Lattice(BaseParser):
|
|||
if f == "h":
|
||||
for i in range(len(t.cells)):
|
||||
for j in range(len(t.cells[i])):
|
||||
if t.cells[i][j].text.strip() == '':
|
||||
if t.cells[i][j].text.strip() == "":
|
||||
if t.cells[i][j].hspan and not t.cells[i][j].left:
|
||||
t.cells[i][j].text = t.cells[i][j - 1].text
|
||||
elif f == "v":
|
||||
for i in range(len(t.cells)):
|
||||
for j in range(len(t.cells[i])):
|
||||
if t.cells[i][j].text.strip() == '':
|
||||
if t.cells[i][j].text.strip() == "":
|
||||
if t.cells[i][j].vspan and not t.cells[i][j].top:
|
||||
t.cells[i][j].text = t.cells[i - 1][j].text
|
||||
return t
|
||||
|
|
@ -185,11 +211,12 @@ class Lattice(BaseParser):
|
|||
def _generate_image(self):
|
||||
from ..ext.ghostscript import Ghostscript
|
||||
|
||||
self.imagename = ''.join([self.rootname, '.png'])
|
||||
gs_call = '-q -sDEVICE=png16m -o {} -r300 {}'.format(
|
||||
self.imagename, self.filename)
|
||||
self.imagename = "".join([self.rootname, ".png"])
|
||||
gs_call = "-q -sDEVICE=png16m -o {} -r300 {}".format(
|
||||
self.imagename, self.filename
|
||||
)
|
||||
gs_call = gs_call.encode().split()
|
||||
null = open(os.devnull, 'wb')
|
||||
null = open(os.devnull, "wb")
|
||||
with Ghostscript(*gs_call, stdout=null) as gs:
|
||||
pass
|
||||
null.close()
|
||||
|
|
@ -208,8 +235,11 @@ class Lattice(BaseParser):
|
|||
return scaled_areas
|
||||
|
||||
self.image, self.threshold = adaptive_threshold(
|
||||
self.imagename, process_background=self.process_background,
|
||||
blocksize=self.threshold_blocksize, c=self.threshold_constant)
|
||||
self.imagename,
|
||||
process_background=self.process_background,
|
||||
blocksize=self.threshold_blocksize,
|
||||
c=self.threshold_constant,
|
||||
)
|
||||
|
||||
image_width = self.image.shape[1]
|
||||
image_height = self.image.shape[0]
|
||||
|
|
@ -226,21 +256,35 @@ class Lattice(BaseParser):
|
|||
regions = scale_areas(self.table_regions)
|
||||
|
||||
vertical_mask, vertical_segments = find_lines(
|
||||
self.threshold, regions=regions, direction='vertical',
|
||||
line_scale=self.line_scale, iterations=self.iterations)
|
||||
self.threshold,
|
||||
regions=regions,
|
||||
direction="vertical",
|
||||
line_scale=self.line_scale,
|
||||
iterations=self.iterations,
|
||||
)
|
||||
horizontal_mask, horizontal_segments = find_lines(
|
||||
self.threshold, regions=regions, direction='horizontal',
|
||||
line_scale=self.line_scale, iterations=self.iterations)
|
||||
self.threshold,
|
||||
regions=regions,
|
||||
direction="horizontal",
|
||||
line_scale=self.line_scale,
|
||||
iterations=self.iterations,
|
||||
)
|
||||
|
||||
contours = find_contours(vertical_mask, horizontal_mask)
|
||||
table_bbox = find_joints(contours, vertical_mask, horizontal_mask)
|
||||
else:
|
||||
vertical_mask, vertical_segments = find_lines(
|
||||
self.threshold, direction='vertical', line_scale=self.line_scale,
|
||||
iterations=self.iterations)
|
||||
self.threshold,
|
||||
direction="vertical",
|
||||
line_scale=self.line_scale,
|
||||
iterations=self.iterations,
|
||||
)
|
||||
horizontal_mask, horizontal_segments = find_lines(
|
||||
self.threshold, direction='horizontal', line_scale=self.line_scale,
|
||||
iterations=self.iterations)
|
||||
self.threshold,
|
||||
direction="horizontal",
|
||||
line_scale=self.line_scale,
|
||||
iterations=self.iterations,
|
||||
)
|
||||
|
||||
areas = scale_areas(self.table_areas)
|
||||
table_bbox = find_joints(areas, vertical_mask, horizontal_mask)
|
||||
|
|
@ -248,18 +292,20 @@ class Lattice(BaseParser):
|
|||
self.table_bbox_unscaled = copy.deepcopy(table_bbox)
|
||||
|
||||
self.table_bbox, self.vertical_segments, self.horizontal_segments = scale_image(
|
||||
table_bbox, vertical_segments, horizontal_segments, pdf_scalers)
|
||||
table_bbox, vertical_segments, horizontal_segments, pdf_scalers
|
||||
)
|
||||
|
||||
def _generate_columns_and_rows(self, table_idx, tk):
|
||||
# select elements which lie within table_bbox
|
||||
t_bbox = {}
|
||||
v_s, h_s = segments_in_bbox(
|
||||
tk, self.vertical_segments, self.horizontal_segments)
|
||||
t_bbox['horizontal'] = text_in_bbox(tk, self.horizontal_text)
|
||||
t_bbox['vertical'] = text_in_bbox(tk, self.vertical_text)
|
||||
tk, self.vertical_segments, self.horizontal_segments
|
||||
)
|
||||
t_bbox["horizontal"] = text_in_bbox(tk, self.horizontal_text)
|
||||
t_bbox["vertical"] = text_in_bbox(tk, self.vertical_text)
|
||||
|
||||
t_bbox['horizontal'].sort(key=lambda x: (-x.y0, x.x0))
|
||||
t_bbox['vertical'].sort(key=lambda x: (x.x0, -x.y0))
|
||||
t_bbox["horizontal"].sort(key=lambda x: (-x.y0, x.x0))
|
||||
t_bbox["vertical"].sort(key=lambda x: (x.x0, -x.y0))
|
||||
|
||||
self.t_bbox = t_bbox
|
||||
|
||||
|
|
@ -268,23 +314,19 @@ class Lattice(BaseParser):
|
|||
cols.extend([tk[0], tk[2]])
|
||||
rows.extend([tk[1], tk[3]])
|
||||
# sort horizontal and vertical segments
|
||||
cols = merge_close_lines(
|
||||
sorted(cols), line_tol=self.line_tol)
|
||||
rows = merge_close_lines(
|
||||
sorted(rows, reverse=True), line_tol=self.line_tol)
|
||||
cols = merge_close_lines(sorted(cols), line_tol=self.line_tol)
|
||||
rows = merge_close_lines(sorted(rows, reverse=True), line_tol=self.line_tol)
|
||||
# make grid using x and y coord of shortlisted rows and cols
|
||||
cols = [(cols[i], cols[i + 1])
|
||||
for i in range(0, len(cols) - 1)]
|
||||
rows = [(rows[i], rows[i + 1])
|
||||
for i in range(0, len(rows) - 1)]
|
||||
cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
|
||||
rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)]
|
||||
|
||||
return cols, rows, v_s, h_s
|
||||
|
||||
def _generate_table(self, table_idx, cols, rows, **kwargs):
|
||||
v_s = kwargs.get('v_s')
|
||||
h_s = kwargs.get('h_s')
|
||||
v_s = kwargs.get("v_s")
|
||||
h_s = kwargs.get("h_s")
|
||||
if v_s is None or h_s is None:
|
||||
raise ValueError('No segments found on {}'.format(self.rootname))
|
||||
raise ValueError("No segments found on {}".format(self.rootname))
|
||||
|
||||
table = Table(cols, rows)
|
||||
# set table edges to True using ver+hor lines
|
||||
|
|
@ -297,14 +339,21 @@ class Lattice(BaseParser):
|
|||
pos_errors = []
|
||||
# TODO: have a single list in place of two directional ones?
|
||||
# sorted on x-coordinate based on reading order i.e. LTR or RTL
|
||||
for direction in ['vertical', 'horizontal']:
|
||||
for direction in ["vertical", "horizontal"]:
|
||||
for t in self.t_bbox[direction]:
|
||||
indices, error = get_table_index(
|
||||
table, t, direction, split_text=self.split_text,
|
||||
flag_size=self.flag_size, strip_text=self.strip_text)
|
||||
table,
|
||||
t,
|
||||
direction,
|
||||
split_text=self.split_text,
|
||||
flag_size=self.flag_size,
|
||||
strip_text=self.strip_text,
|
||||
)
|
||||
if indices[:2] != (-1, -1):
|
||||
pos_errors.append(error)
|
||||
indices = Lattice._reduce_index(table, indices, shift_text=self.shift_text)
|
||||
indices = Lattice._reduce_index(
|
||||
table, indices, shift_text=self.shift_text
|
||||
)
|
||||
for r_idx, c_idx, text in indices:
|
||||
table.cells[r_idx][c_idx].text = text
|
||||
accuracy = compute_accuracy([[100, pos_errors]])
|
||||
|
|
@ -317,11 +366,11 @@ class Lattice(BaseParser):
|
|||
table.shape = table.df.shape
|
||||
|
||||
whitespace = compute_whitespace(data)
|
||||
table.flavor = 'lattice'
|
||||
table.flavor = "lattice"
|
||||
table.accuracy = accuracy
|
||||
table.whitespace = whitespace
|
||||
table.order = table_idx + 1
|
||||
table.page = int(os.path.basename(self.rootname).replace('page-', ''))
|
||||
table.page = int(os.path.basename(self.rootname).replace("page-", ""))
|
||||
|
||||
# for plotting
|
||||
_text = []
|
||||
|
|
@ -337,15 +386,18 @@ class Lattice(BaseParser):
|
|||
def extract_tables(self, filename, suppress_stdout=False, layout_kwargs={}):
|
||||
self._generate_layout(filename, layout_kwargs)
|
||||
if not suppress_stdout:
|
||||
logger.info('Processing {}'.format(os.path.basename(self.rootname)))
|
||||
logger.info("Processing {}".format(os.path.basename(self.rootname)))
|
||||
|
||||
if not self.horizontal_text:
|
||||
if self.images:
|
||||
warnings.warn('{} is image-based, camelot only works on'
|
||||
' text-based pages.'.format(os.path.basename(self.rootname)))
|
||||
warnings.warn(
|
||||
"{} is image-based, camelot only works on"
|
||||
" text-based pages.".format(os.path.basename(self.rootname))
|
||||
)
|
||||
else:
|
||||
warnings.warn('No tables found on {}'.format(
|
||||
os.path.basename(self.rootname)))
|
||||
warnings.warn(
|
||||
"No tables found on {}".format(os.path.basename(self.rootname))
|
||||
)
|
||||
return []
|
||||
|
||||
self._generate_image()
|
||||
|
|
@ -353,8 +405,9 @@ class Lattice(BaseParser):
|
|||
|
||||
_tables = []
|
||||
# sort tables based on y-coord
|
||||
for table_idx, tk in enumerate(sorted(
|
||||
self.table_bbox.keys(), key=lambda x: x[1], reverse=True)):
|
||||
for table_idx, tk in enumerate(
|
||||
sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True)
|
||||
):
|
||||
cols, rows, v_s, h_s = self._generate_columns_and_rows(table_idx, tk)
|
||||
table = self._generate_table(table_idx, cols, rows, v_s=v_s, h_s=h_s)
|
||||
table._bbox = tk
|
||||
|
|
|
|||
|
|
@ -10,11 +10,10 @@ import pandas as pd
|
|||
|
||||
from .base import BaseParser
|
||||
from ..core import TextEdges, Table
|
||||
from ..utils import (text_in_bbox, get_table_index, compute_accuracy,
|
||||
compute_whitespace)
|
||||
from ..utils import text_in_bbox, get_table_index, compute_accuracy, compute_whitespace
|
||||
|
||||
|
||||
logger = logging.getLogger('camelot')
|
||||
logger = logging.getLogger("camelot")
|
||||
|
||||
|
||||
class Stream(BaseParser):
|
||||
|
|
@ -55,9 +54,20 @@ class Stream(BaseParser):
|
|||
to generate columns.
|
||||
|
||||
"""
|
||||
def __init__(self, table_regions=None, table_areas=None, columns=None, split_text=False,
|
||||
flag_size=False, strip_text='', edge_tol=50, row_tol=2,
|
||||
column_tol=0, **kwargs):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
table_regions=None,
|
||||
table_areas=None,
|
||||
columns=None,
|
||||
split_text=False,
|
||||
flag_size=False,
|
||||
strip_text="",
|
||||
edge_tol=50,
|
||||
row_tol=2,
|
||||
column_tol=0,
|
||||
**kwargs
|
||||
):
|
||||
self.table_regions = table_regions
|
||||
self.table_areas = table_areas
|
||||
self.columns = columns
|
||||
|
|
@ -150,8 +160,9 @@ class Stream(BaseParser):
|
|||
else:
|
||||
lower = merged[-1]
|
||||
if column_tol >= 0:
|
||||
if (higher[0] <= lower[1] or
|
||||
np.isclose(higher[0], lower[1], atol=column_tol)):
|
||||
if higher[0] <= lower[1] or np.isclose(
|
||||
higher[0], lower[1], atol=column_tol
|
||||
):
|
||||
upper_bound = max(lower[1], higher[1])
|
||||
lower_bound = min(lower[0], higher[0])
|
||||
merged[-1] = (lower_bound, upper_bound)
|
||||
|
|
@ -186,13 +197,14 @@ class Stream(BaseParser):
|
|||
List of continuous row y-coordinate tuples.
|
||||
|
||||
"""
|
||||
row_mids = [sum([(t.y0 + t.y1) / 2 for t in r]) / len(r)
|
||||
if len(r) > 0 else 0 for r in rows_grouped]
|
||||
row_mids = [
|
||||
sum([(t.y0 + t.y1) / 2 for t in r]) / len(r) if len(r) > 0 else 0
|
||||
for r in rows_grouped
|
||||
]
|
||||
rows = [(row_mids[i] + row_mids[i - 1]) / 2 for i in range(1, len(row_mids))]
|
||||
rows.insert(0, text_y_max)
|
||||
rows.append(text_y_min)
|
||||
rows = [(rows[i], rows[i + 1])
|
||||
for i in range(0, len(rows) - 1)]
|
||||
rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)]
|
||||
return rows
|
||||
|
||||
@staticmethod
|
||||
|
|
@ -217,8 +229,9 @@ class Stream(BaseParser):
|
|||
if text:
|
||||
text = Stream._group_rows(text, row_tol=row_tol)
|
||||
elements = [len(r) for r in text]
|
||||
new_cols = [(t.x0, t.x1)
|
||||
for r in text if len(r) == max(elements) for t in r]
|
||||
new_cols = [
|
||||
(t.x0, t.x1) for r in text if len(r) == max(elements) for t in r
|
||||
]
|
||||
cols.extend(Stream._merge_columns(sorted(new_cols)))
|
||||
return cols
|
||||
|
||||
|
|
@ -243,15 +256,13 @@ class Stream(BaseParser):
|
|||
cols = [(cols[i][0] + cols[i - 1][1]) / 2 for i in range(1, len(cols))]
|
||||
cols.insert(0, text_x_min)
|
||||
cols.append(text_x_max)
|
||||
cols = [(cols[i], cols[i + 1])
|
||||
for i in range(0, len(cols) - 1)]
|
||||
cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
|
||||
return cols
|
||||
|
||||
def _validate_columns(self):
|
||||
if self.table_areas is not None and self.columns is not None:
|
||||
if len(self.table_areas) != len(self.columns):
|
||||
raise ValueError("Length of table_areas and columns"
|
||||
" should be equal")
|
||||
raise ValueError("Length of table_areas and columns" " should be equal")
|
||||
|
||||
def _nurminen_table_detection(self, textlines):
|
||||
"""A general implementation of the table detection algorithm
|
||||
|
|
@ -309,16 +320,16 @@ class Stream(BaseParser):
|
|||
def _generate_columns_and_rows(self, table_idx, tk):
|
||||
# select elements which lie within table_bbox
|
||||
t_bbox = {}
|
||||
t_bbox['horizontal'] = text_in_bbox(tk, self.horizontal_text)
|
||||
t_bbox['vertical'] = text_in_bbox(tk, self.vertical_text)
|
||||
t_bbox["horizontal"] = text_in_bbox(tk, self.horizontal_text)
|
||||
t_bbox["vertical"] = text_in_bbox(tk, self.vertical_text)
|
||||
|
||||
t_bbox['horizontal'].sort(key=lambda x: (-x.y0, x.x0))
|
||||
t_bbox['vertical'].sort(key=lambda x: (x.x0, -x.y0))
|
||||
t_bbox["horizontal"].sort(key=lambda x: (-x.y0, x.x0))
|
||||
t_bbox["vertical"].sort(key=lambda x: (x.x0, -x.y0))
|
||||
|
||||
self.t_bbox = t_bbox
|
||||
|
||||
text_x_min, text_y_min, text_x_max, text_y_max = self._text_bbox(self.t_bbox)
|
||||
rows_grouped = self._group_rows(self.t_bbox['horizontal'], row_tol=self.row_tol)
|
||||
rows_grouped = self._group_rows(self.t_bbox["horizontal"], row_tol=self.row_tol)
|
||||
rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
|
||||
elements = [len(r) for r in rows_grouped]
|
||||
|
||||
|
|
@ -327,7 +338,7 @@ class Stream(BaseParser):
|
|||
# take (0, pdf_width) by default
|
||||
# similar to else condition
|
||||
# len can't be 1
|
||||
cols = self.columns[table_idx].split(',')
|
||||
cols = self.columns[table_idx].split(",")
|
||||
cols = [float(c) for c in cols]
|
||||
cols.insert(0, text_x_min)
|
||||
cols.append(text_x_max)
|
||||
|
|
@ -346,20 +357,29 @@ class Stream(BaseParser):
|
|||
if len(elements):
|
||||
ncols = max(set(elements), key=elements.count)
|
||||
else:
|
||||
warnings.warn("No tables found in table area {}".format(
|
||||
table_idx + 1))
|
||||
warnings.warn(
|
||||
"No tables found in table area {}".format(table_idx + 1)
|
||||
)
|
||||
cols = [(t.x0, t.x1) for r in rows_grouped if len(r) == ncols for t in r]
|
||||
cols = self._merge_columns(sorted(cols), column_tol=self.column_tol)
|
||||
inner_text = []
|
||||
for i in range(1, len(cols)):
|
||||
left = cols[i - 1][1]
|
||||
right = cols[i][0]
|
||||
inner_text.extend([t for direction in self.t_bbox
|
||||
for t in self.t_bbox[direction]
|
||||
if t.x0 > left and t.x1 < right])
|
||||
outer_text = [t for direction in self.t_bbox
|
||||
for t in self.t_bbox[direction]
|
||||
if t.x0 > cols[-1][1] or t.x1 < cols[0][0]]
|
||||
inner_text.extend(
|
||||
[
|
||||
t
|
||||
for direction in self.t_bbox
|
||||
for t in self.t_bbox[direction]
|
||||
if t.x0 > left and t.x1 < right
|
||||
]
|
||||
)
|
||||
outer_text = [
|
||||
t
|
||||
for direction in self.t_bbox
|
||||
for t in self.t_bbox[direction]
|
||||
if t.x0 > cols[-1][1] or t.x1 < cols[0][0]
|
||||
]
|
||||
inner_text.extend(outer_text)
|
||||
cols = self._add_columns(cols, inner_text, self.row_tol)
|
||||
cols = self._join_columns(cols, text_x_min, text_x_max)
|
||||
|
|
@ -373,11 +393,16 @@ class Stream(BaseParser):
|
|||
pos_errors = []
|
||||
# TODO: have a single list in place of two directional ones?
|
||||
# sorted on x-coordinate based on reading order i.e. LTR or RTL
|
||||
for direction in ['vertical', 'horizontal']:
|
||||
for direction in ["vertical", "horizontal"]:
|
||||
for t in self.t_bbox[direction]:
|
||||
indices, error = get_table_index(
|
||||
table, t, direction, split_text=self.split_text,
|
||||
flag_size=self.flag_size, strip_text=self.strip_text)
|
||||
table,
|
||||
t,
|
||||
direction,
|
||||
split_text=self.split_text,
|
||||
flag_size=self.flag_size,
|
||||
strip_text=self.strip_text,
|
||||
)
|
||||
if indices[:2] != (-1, -1):
|
||||
pos_errors.append(error)
|
||||
for r_idx, c_idx, text in indices:
|
||||
|
|
@ -389,11 +414,11 @@ class Stream(BaseParser):
|
|||
table.shape = table.df.shape
|
||||
|
||||
whitespace = compute_whitespace(data)
|
||||
table.flavor = 'stream'
|
||||
table.flavor = "stream"
|
||||
table.accuracy = accuracy
|
||||
table.whitespace = whitespace
|
||||
table.order = table_idx + 1
|
||||
table.page = int(os.path.basename(self.rootname).replace('page-', ''))
|
||||
table.page = int(os.path.basename(self.rootname).replace("page-", ""))
|
||||
|
||||
# for plotting
|
||||
_text = []
|
||||
|
|
@ -409,23 +434,27 @@ class Stream(BaseParser):
|
|||
def extract_tables(self, filename, suppress_stdout=False, layout_kwargs={}):
|
||||
self._generate_layout(filename, layout_kwargs)
|
||||
if not suppress_stdout:
|
||||
logger.info('Processing {}'.format(os.path.basename(self.rootname)))
|
||||
logger.info("Processing {}".format(os.path.basename(self.rootname)))
|
||||
|
||||
if not self.horizontal_text:
|
||||
if self.images:
|
||||
warnings.warn('{} is image-based, camelot only works on'
|
||||
' text-based pages.'.format(os.path.basename(self.rootname)))
|
||||
warnings.warn(
|
||||
"{} is image-based, camelot only works on"
|
||||
" text-based pages.".format(os.path.basename(self.rootname))
|
||||
)
|
||||
else:
|
||||
warnings.warn('No tables found on {}'.format(
|
||||
os.path.basename(self.rootname)))
|
||||
warnings.warn(
|
||||
"No tables found on {}".format(os.path.basename(self.rootname))
|
||||
)
|
||||
return []
|
||||
|
||||
self._generate_table_bbox()
|
||||
|
||||
_tables = []
|
||||
# sort tables based on y-coord
|
||||
for table_idx, tk in enumerate(sorted(
|
||||
self.table_bbox.keys(), key=lambda x: x[1], reverse=True)):
|
||||
for table_idx, tk in enumerate(
|
||||
sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True)
|
||||
):
|
||||
cols, rows = self._generate_columns_and_rows(table_idx, tk)
|
||||
table = self._generate_table(table_idx, cols, rows)
|
||||
table._bbox = tk
|
||||
|
|
|
|||
|
|
@ -10,7 +10,7 @@ else:
|
|||
|
||||
|
||||
class PlotMethods(object):
|
||||
def __call__(self, table, kind='text', filename=None):
|
||||
def __call__(self, table, kind="text", filename=None):
|
||||
"""Plot elements found on PDF page based on kind
|
||||
specified, useful for debugging and playing with different
|
||||
parameters to get the best output.
|
||||
|
|
@ -31,14 +31,16 @@ class PlotMethods(object):
|
|||
|
||||
"""
|
||||
if not _HAS_MPL:
|
||||
raise ImportError('matplotlib is required for plotting.')
|
||||
raise ImportError("matplotlib is required for plotting.")
|
||||
|
||||
if table.flavor == 'lattice' and kind in ['textedge']:
|
||||
raise NotImplementedError("Lattice flavor does not support kind='{}'".format(
|
||||
kind))
|
||||
elif table.flavor == 'stream' and kind in ['joint', 'line']:
|
||||
raise NotImplementedError("Stream flavor does not support kind='{}'".format(
|
||||
kind))
|
||||
if table.flavor == "lattice" and kind in ["textedge"]:
|
||||
raise NotImplementedError(
|
||||
"Lattice flavor does not support kind='{}'".format(kind)
|
||||
)
|
||||
elif table.flavor == "stream" and kind in ["joint", "line"]:
|
||||
raise NotImplementedError(
|
||||
"Stream flavor does not support kind='{}'".format(kind)
|
||||
)
|
||||
|
||||
plot_method = getattr(self, kind)
|
||||
return plot_method(table)
|
||||
|
|
@ -57,18 +59,12 @@ class PlotMethods(object):
|
|||
|
||||
"""
|
||||
fig = plt.figure()
|
||||
ax = fig.add_subplot(111, aspect='equal')
|
||||
ax = fig.add_subplot(111, aspect="equal")
|
||||
xs, ys = [], []
|
||||
for t in table._text:
|
||||
xs.extend([t[0], t[2]])
|
||||
ys.extend([t[1], t[3]])
|
||||
ax.add_patch(
|
||||
patches.Rectangle(
|
||||
(t[0], t[1]),
|
||||
t[2] - t[0],
|
||||
t[3] - t[1]
|
||||
)
|
||||
)
|
||||
ax.add_patch(patches.Rectangle((t[0], t[1]), t[2] - t[0], t[3] - t[1]))
|
||||
ax.set_xlim(min(xs) - 10, max(xs) + 10)
|
||||
ax.set_ylim(min(ys) - 10, max(ys) + 10)
|
||||
return fig
|
||||
|
|
@ -87,21 +83,17 @@ class PlotMethods(object):
|
|||
|
||||
"""
|
||||
fig = plt.figure()
|
||||
ax = fig.add_subplot(111, aspect='equal')
|
||||
ax = fig.add_subplot(111, aspect="equal")
|
||||
for row in table.cells:
|
||||
for cell in row:
|
||||
if cell.left:
|
||||
ax.plot([cell.lb[0], cell.lt[0]],
|
||||
[cell.lb[1], cell.lt[1]])
|
||||
ax.plot([cell.lb[0], cell.lt[0]], [cell.lb[1], cell.lt[1]])
|
||||
if cell.right:
|
||||
ax.plot([cell.rb[0], cell.rt[0]],
|
||||
[cell.rb[1], cell.rt[1]])
|
||||
ax.plot([cell.rb[0], cell.rt[0]], [cell.rb[1], cell.rt[1]])
|
||||
if cell.top:
|
||||
ax.plot([cell.lt[0], cell.rt[0]],
|
||||
[cell.lt[1], cell.rt[1]])
|
||||
ax.plot([cell.lt[0], cell.rt[0]], [cell.lt[1], cell.rt[1]])
|
||||
if cell.bottom:
|
||||
ax.plot([cell.lb[0], cell.rb[0]],
|
||||
[cell.lb[1], cell.rb[1]])
|
||||
ax.plot([cell.lb[0], cell.rb[0]], [cell.lb[1], cell.rb[1]])
|
||||
return fig
|
||||
|
||||
def contour(self, table):
|
||||
|
|
@ -124,7 +116,7 @@ class PlotMethods(object):
|
|||
img, table_bbox = (None, {table._bbox: None})
|
||||
_FOR_LATTICE = False
|
||||
fig = plt.figure()
|
||||
ax = fig.add_subplot(111, aspect='equal')
|
||||
ax = fig.add_subplot(111, aspect="equal")
|
||||
|
||||
xs, ys = [], []
|
||||
if not _FOR_LATTICE:
|
||||
|
|
@ -133,21 +125,14 @@ class PlotMethods(object):
|
|||
ys.extend([t[1], t[3]])
|
||||
ax.add_patch(
|
||||
patches.Rectangle(
|
||||
(t[0], t[1]),
|
||||
t[2] - t[0],
|
||||
t[3] - t[1],
|
||||
color='blue'
|
||||
(t[0], t[1]), t[2] - t[0], t[3] - t[1], color="blue"
|
||||
)
|
||||
)
|
||||
|
||||
for t in table_bbox.keys():
|
||||
ax.add_patch(
|
||||
patches.Rectangle(
|
||||
(t[0], t[1]),
|
||||
t[2] - t[0],
|
||||
t[3] - t[1],
|
||||
fill=False,
|
||||
color='red'
|
||||
(t[0], t[1]), t[2] - t[0], t[3] - t[1], fill=False, color="red"
|
||||
)
|
||||
)
|
||||
if not _FOR_LATTICE:
|
||||
|
|
@ -173,25 +158,19 @@ class PlotMethods(object):
|
|||
|
||||
"""
|
||||
fig = plt.figure()
|
||||
ax = fig.add_subplot(111, aspect='equal')
|
||||
ax = fig.add_subplot(111, aspect="equal")
|
||||
xs, ys = [], []
|
||||
for t in table._text:
|
||||
xs.extend([t[0], t[2]])
|
||||
ys.extend([t[1], t[3]])
|
||||
ax.add_patch(
|
||||
patches.Rectangle(
|
||||
(t[0], t[1]),
|
||||
t[2] - t[0],
|
||||
t[3] - t[1],
|
||||
color='blue'
|
||||
)
|
||||
patches.Rectangle((t[0], t[1]), t[2] - t[0], t[3] - t[1], color="blue")
|
||||
)
|
||||
ax.set_xlim(min(xs) - 10, max(xs) + 10)
|
||||
ax.set_ylim(min(ys) - 10, max(ys) + 10)
|
||||
|
||||
for te in table._textedges:
|
||||
ax.plot([te.x, te.x],
|
||||
[te.y0, te.y1])
|
||||
ax.plot([te.x, te.x], [te.y0, te.y1])
|
||||
|
||||
return fig
|
||||
|
||||
|
|
@ -210,14 +189,14 @@ class PlotMethods(object):
|
|||
"""
|
||||
img, table_bbox = table._image
|
||||
fig = plt.figure()
|
||||
ax = fig.add_subplot(111, aspect='equal')
|
||||
ax = fig.add_subplot(111, aspect="equal")
|
||||
x_coord = []
|
||||
y_coord = []
|
||||
for k in table_bbox.keys():
|
||||
for coord in table_bbox[k]:
|
||||
x_coord.append(coord[0])
|
||||
y_coord.append(coord[1])
|
||||
ax.plot(x_coord, y_coord, 'ro')
|
||||
ax.plot(x_coord, y_coord, "ro")
|
||||
ax.imshow(img)
|
||||
return fig
|
||||
|
||||
|
|
@ -235,7 +214,7 @@ class PlotMethods(object):
|
|||
|
||||
"""
|
||||
fig = plt.figure()
|
||||
ax = fig.add_subplot(111, aspect='equal')
|
||||
ax = fig.add_subplot(111, aspect="equal")
|
||||
vertical, horizontal = table._segments
|
||||
for v in vertical:
|
||||
ax.plot([v[0], v[2]], [v[1], v[3]])
|
||||
|
|
|
|||
271
camelot/utils.py
271
camelot/utils.py
|
|
@ -19,8 +19,14 @@ from pdfminer.pdfpage import PDFTextExtractionNotAllowed
|
|||
from pdfminer.pdfinterp import PDFResourceManager
|
||||
from pdfminer.pdfinterp import PDFPageInterpreter
|
||||
from pdfminer.converter import PDFPageAggregator
|
||||
from pdfminer.layout import (LAParams, LTAnno, LTChar, LTTextLineHorizontal,
|
||||
LTTextLineVertical, LTImage)
|
||||
from pdfminer.layout import (
|
||||
LAParams,
|
||||
LTAnno,
|
||||
LTChar,
|
||||
LTTextLineHorizontal,
|
||||
LTTextLineVertical,
|
||||
LTImage,
|
||||
)
|
||||
|
||||
|
||||
PY3 = sys.version_info[0] >= 3
|
||||
|
|
@ -35,7 +41,7 @@ else:
|
|||
|
||||
|
||||
_VALID_URLS = set(uses_relative + uses_netloc + uses_params)
|
||||
_VALID_URLS.discard('')
|
||||
_VALID_URLS.discard("")
|
||||
|
||||
|
||||
# https://github.com/pandas-dev/pandas/blob/master/pandas/io/common.py
|
||||
|
|
@ -59,9 +65,11 @@ def is_url(url):
|
|||
|
||||
|
||||
def random_string(length):
|
||||
ret = ''
|
||||
ret = ""
|
||||
while length:
|
||||
ret += random.choice(string.digits + string.ascii_lowercase + string.ascii_uppercase)
|
||||
ret += random.choice(
|
||||
string.digits + string.ascii_lowercase + string.ascii_uppercase
|
||||
)
|
||||
length -= 1
|
||||
return ret
|
||||
|
||||
|
|
@ -79,14 +87,14 @@ def download_url(url):
|
|||
Temporary filepath.
|
||||
|
||||
"""
|
||||
filename = '{}.pdf'.format(random_string(6))
|
||||
with tempfile.NamedTemporaryFile('wb', delete=False) as f:
|
||||
filename = "{}.pdf".format(random_string(6))
|
||||
with tempfile.NamedTemporaryFile("wb", delete=False) as f:
|
||||
obj = urlopen(url)
|
||||
if PY3:
|
||||
content_type = obj.info().get_content_type()
|
||||
else:
|
||||
content_type = obj.info().getheader('Content-Type')
|
||||
if content_type != 'application/pdf':
|
||||
content_type = obj.info().getheader("Content-Type")
|
||||
if content_type != "application/pdf":
|
||||
raise NotImplementedError("File format not supported")
|
||||
f.write(obj.read())
|
||||
filepath = os.path.join(os.path.dirname(f.name), filename)
|
||||
|
|
@ -94,39 +102,38 @@ def download_url(url):
|
|||
return filepath
|
||||
|
||||
|
||||
stream_kwargs = [
|
||||
'columns',
|
||||
'row_tol',
|
||||
'column_tol'
|
||||
]
|
||||
stream_kwargs = ["columns", "row_tol", "column_tol"]
|
||||
lattice_kwargs = [
|
||||
'process_background',
|
||||
'line_scale',
|
||||
'copy_text',
|
||||
'shift_text',
|
||||
'line_tol',
|
||||
'joint_tol',
|
||||
'threshold_blocksize',
|
||||
'threshold_constant',
|
||||
'iterations'
|
||||
"process_background",
|
||||
"line_scale",
|
||||
"copy_text",
|
||||
"shift_text",
|
||||
"line_tol",
|
||||
"joint_tol",
|
||||
"threshold_blocksize",
|
||||
"threshold_constant",
|
||||
"iterations",
|
||||
]
|
||||
|
||||
|
||||
def validate_input(kwargs, flavor='lattice'):
|
||||
def validate_input(kwargs, flavor="lattice"):
|
||||
def check_intersection(parser_kwargs, input_kwargs):
|
||||
isec = set(parser_kwargs).intersection(set(input_kwargs.keys()))
|
||||
if isec:
|
||||
raise ValueError("{} cannot be used with flavor='{}'".format(
|
||||
",".join(sorted(isec)), flavor))
|
||||
raise ValueError(
|
||||
"{} cannot be used with flavor='{}'".format(
|
||||
",".join(sorted(isec)), flavor
|
||||
)
|
||||
)
|
||||
|
||||
if flavor == 'lattice':
|
||||
if flavor == "lattice":
|
||||
check_intersection(stream_kwargs, kwargs)
|
||||
else:
|
||||
check_intersection(lattice_kwargs, kwargs)
|
||||
|
||||
|
||||
def remove_extra(kwargs, flavor='lattice'):
|
||||
if flavor == 'lattice':
|
||||
def remove_extra(kwargs, flavor="lattice"):
|
||||
if flavor == "lattice":
|
||||
for key in kwargs.keys():
|
||||
if key in stream_kwargs:
|
||||
kwargs.pop(key)
|
||||
|
|
@ -256,15 +263,19 @@ def scale_image(tables, v_segments, h_segments, factors):
|
|||
v_segments_new = []
|
||||
for v in v_segments:
|
||||
x1, x2 = scale(v[0], scaling_factor_x), scale(v[2], scaling_factor_x)
|
||||
y1, y2 = scale(abs(translate(-img_y, v[1])), scaling_factor_y), scale(
|
||||
abs(translate(-img_y, v[3])), scaling_factor_y)
|
||||
y1, y2 = (
|
||||
scale(abs(translate(-img_y, v[1])), scaling_factor_y),
|
||||
scale(abs(translate(-img_y, v[3])), scaling_factor_y),
|
||||
)
|
||||
v_segments_new.append((x1, y1, x2, y2))
|
||||
|
||||
h_segments_new = []
|
||||
for h in h_segments:
|
||||
x1, x2 = scale(h[0], scaling_factor_x), scale(h[2], scaling_factor_x)
|
||||
y1, y2 = scale(abs(translate(-img_y, h[1])), scaling_factor_y), scale(
|
||||
abs(translate(-img_y, h[3])), scaling_factor_y)
|
||||
y1, y2 = (
|
||||
scale(abs(translate(-img_y, h[1])), scaling_factor_y),
|
||||
scale(abs(translate(-img_y, h[3])), scaling_factor_y),
|
||||
)
|
||||
h_segments_new.append((x1, y1, x2, y2))
|
||||
|
||||
return tables_new, v_segments_new, h_segments_new
|
||||
|
|
@ -291,13 +302,13 @@ def get_rotation(chars, horizontal_text, vertical_text):
|
|||
rotated 90 degree clockwise.
|
||||
|
||||
"""
|
||||
rotation = ''
|
||||
rotation = ""
|
||||
hlen = len([t for t in horizontal_text if t.get_text().strip()])
|
||||
vlen = len([t for t in vertical_text if t.get_text().strip()])
|
||||
if hlen < vlen:
|
||||
clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in chars)
|
||||
anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in chars)
|
||||
rotation = 'anticlockwise' if clockwise < anticlockwise else 'clockwise'
|
||||
rotation = "anticlockwise" if clockwise < anticlockwise else "clockwise"
|
||||
return rotation
|
||||
|
||||
|
||||
|
|
@ -325,10 +336,16 @@ def segments_in_bbox(bbox, v_segments, h_segments):
|
|||
"""
|
||||
lb = (bbox[0], bbox[1])
|
||||
rt = (bbox[2], bbox[3])
|
||||
v_s = [v for v in v_segments if v[1] > lb[1] - 2 and
|
||||
v[3] < rt[1] + 2 and lb[0] - 2 <= v[0] <= rt[0] + 2]
|
||||
h_s = [h for h in h_segments if h[0] > lb[0] - 2 and
|
||||
h[2] < rt[0] + 2 and lb[1] - 2 <= h[1] <= rt[1] + 2]
|
||||
v_s = [
|
||||
v
|
||||
for v in v_segments
|
||||
if v[1] > lb[1] - 2 and v[3] < rt[1] + 2 and lb[0] - 2 <= v[0] <= rt[0] + 2
|
||||
]
|
||||
h_s = [
|
||||
h
|
||||
for h in h_segments
|
||||
if h[0] > lb[0] - 2 and h[2] < rt[0] + 2 and lb[1] - 2 <= h[1] <= rt[1] + 2
|
||||
]
|
||||
return v_s, h_s
|
||||
|
||||
|
||||
|
|
@ -351,9 +368,12 @@ def text_in_bbox(bbox, text):
|
|||
"""
|
||||
lb = (bbox[0], bbox[1])
|
||||
rt = (bbox[2], bbox[3])
|
||||
t_bbox = [t for t in text if lb[0] - 2 <= (t.x0 + t.x1) / 2.0
|
||||
<= rt[0] + 2 and lb[1] - 2 <= (t.y0 + t.y1) / 2.0
|
||||
<= rt[1] + 2]
|
||||
t_bbox = [
|
||||
t
|
||||
for t in text
|
||||
if lb[0] - 2 <= (t.x0 + t.x1) / 2.0 <= rt[0] + 2
|
||||
and lb[1] - 2 <= (t.y0 + t.y1) / 2.0 <= rt[1] + 2
|
||||
]
|
||||
return t_bbox
|
||||
|
||||
|
||||
|
|
@ -390,7 +410,7 @@ def merge_close_lines(ar, line_tol=2):
|
|||
# (inspired from sklearn.pipeline.Pipeline)
|
||||
|
||||
|
||||
def flag_font_size(textline, direction, strip_text=''):
|
||||
def flag_font_size(textline, direction, strip_text=""):
|
||||
"""Flags super/subscripts in text by enclosing them with <s></s>.
|
||||
May give false positives.
|
||||
|
||||
|
|
@ -409,10 +429,18 @@ def flag_font_size(textline, direction, strip_text=''):
|
|||
fstring : string
|
||||
|
||||
"""
|
||||
if direction == 'horizontal':
|
||||
d = [(t.get_text(), np.round(t.height, decimals=6)) for t in textline if not isinstance(t, LTAnno)]
|
||||
elif direction == 'vertical':
|
||||
d = [(t.get_text(), np.round(t.width, decimals=6)) for t in textline if not isinstance(t, LTAnno)]
|
||||
if direction == "horizontal":
|
||||
d = [
|
||||
(t.get_text(), np.round(t.height, decimals=6))
|
||||
for t in textline
|
||||
if not isinstance(t, LTAnno)
|
||||
]
|
||||
elif direction == "vertical":
|
||||
d = [
|
||||
(t.get_text(), np.round(t.width, decimals=6))
|
||||
for t in textline
|
||||
if not isinstance(t, LTAnno)
|
||||
]
|
||||
l = [np.round(size, decimals=6) for text, size in d]
|
||||
if len(set(l)) > 1:
|
||||
flist = []
|
||||
|
|
@ -420,21 +448,21 @@ def flag_font_size(textline, direction, strip_text=''):
|
|||
for key, chars in groupby(d, itemgetter(1)):
|
||||
if key == min_size:
|
||||
fchars = [t[0] for t in chars]
|
||||
if ''.join(fchars).strip():
|
||||
fchars.insert(0, '<s>')
|
||||
fchars.append('</s>')
|
||||
flist.append(''.join(fchars))
|
||||
if "".join(fchars).strip():
|
||||
fchars.insert(0, "<s>")
|
||||
fchars.append("</s>")
|
||||
flist.append("".join(fchars))
|
||||
else:
|
||||
fchars = [t[0] for t in chars]
|
||||
if ''.join(fchars).strip():
|
||||
flist.append(''.join(fchars))
|
||||
fstring = ''.join(flist).strip(strip_text)
|
||||
if "".join(fchars).strip():
|
||||
flist.append("".join(fchars))
|
||||
fstring = "".join(flist).strip(strip_text)
|
||||
else:
|
||||
fstring = ''.join([t.get_text() for t in textline]).strip(strip_text)
|
||||
fstring = "".join([t.get_text() for t in textline]).strip(strip_text)
|
||||
return fstring
|
||||
|
||||
|
||||
def split_textline(table, textline, direction, flag_size=False, strip_text=''):
|
||||
def split_textline(table, textline, direction, flag_size=False, strip_text=""):
|
||||
"""Splits PDFMiner LTTextLine into substrings if it spans across
|
||||
multiple rows/columns.
|
||||
|
||||
|
|
@ -464,19 +492,31 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=''):
|
|||
cut_text = []
|
||||
bbox = textline.bbox
|
||||
try:
|
||||
if direction == 'horizontal' and not textline.is_empty():
|
||||
x_overlap = [i for i, x in enumerate(table.cols) if x[0] <= bbox[2] and bbox[0] <= x[1]]
|
||||
r_idx = [j for j, r in enumerate(table.rows) if r[1] <= (bbox[1] + bbox[3]) / 2 <= r[0]]
|
||||
if direction == "horizontal" and not textline.is_empty():
|
||||
x_overlap = [
|
||||
i
|
||||
for i, x in enumerate(table.cols)
|
||||
if x[0] <= bbox[2] and bbox[0] <= x[1]
|
||||
]
|
||||
r_idx = [
|
||||
j
|
||||
for j, r in enumerate(table.rows)
|
||||
if r[1] <= (bbox[1] + bbox[3]) / 2 <= r[0]
|
||||
]
|
||||
r = r_idx[0]
|
||||
x_cuts = [(c, table.cells[r][c].x2) for c in x_overlap if table.cells[r][c].right]
|
||||
x_cuts = [
|
||||
(c, table.cells[r][c].x2) for c in x_overlap if table.cells[r][c].right
|
||||
]
|
||||
if not x_cuts:
|
||||
x_cuts = [(x_overlap[0], table.cells[r][-1].x2)]
|
||||
for obj in textline._objs:
|
||||
row = table.rows[r]
|
||||
for cut in x_cuts:
|
||||
if isinstance(obj, LTChar):
|
||||
if (row[1] <= (obj.y0 + obj.y1) / 2 <= row[0] and
|
||||
(obj.x0 + obj.x1) / 2 <= cut[1]):
|
||||
if (
|
||||
row[1] <= (obj.y0 + obj.y1) / 2 <= row[0]
|
||||
and (obj.x0 + obj.x1) / 2 <= cut[1]
|
||||
):
|
||||
cut_text.append((r, cut[0], obj))
|
||||
break
|
||||
else:
|
||||
|
|
@ -485,19 +525,31 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=''):
|
|||
cut_text.append((r, cut[0] + 1, obj))
|
||||
elif isinstance(obj, LTAnno):
|
||||
cut_text.append((r, cut[0], obj))
|
||||
elif direction == 'vertical' and not textline.is_empty():
|
||||
y_overlap = [j for j, y in enumerate(table.rows) if y[1] <= bbox[3] and bbox[1] <= y[0]]
|
||||
c_idx = [i for i, c in enumerate(table.cols) if c[0] <= (bbox[0] + bbox[2]) / 2 <= c[1]]
|
||||
elif direction == "vertical" and not textline.is_empty():
|
||||
y_overlap = [
|
||||
j
|
||||
for j, y in enumerate(table.rows)
|
||||
if y[1] <= bbox[3] and bbox[1] <= y[0]
|
||||
]
|
||||
c_idx = [
|
||||
i
|
||||
for i, c in enumerate(table.cols)
|
||||
if c[0] <= (bbox[0] + bbox[2]) / 2 <= c[1]
|
||||
]
|
||||
c = c_idx[0]
|
||||
y_cuts = [(r, table.cells[r][c].y1) for r in y_overlap if table.cells[r][c].bottom]
|
||||
y_cuts = [
|
||||
(r, table.cells[r][c].y1) for r in y_overlap if table.cells[r][c].bottom
|
||||
]
|
||||
if not y_cuts:
|
||||
y_cuts = [(y_overlap[0], table.cells[-1][c].y1)]
|
||||
for obj in textline._objs:
|
||||
col = table.cols[c]
|
||||
for cut in y_cuts:
|
||||
if isinstance(obj, LTChar):
|
||||
if (col[0] <= (obj.x0 + obj.x1) / 2 <= col[1] and
|
||||
(obj.y0 + obj.y1) / 2 >= cut[1]):
|
||||
if (
|
||||
col[0] <= (obj.x0 + obj.x1) / 2 <= col[1]
|
||||
and (obj.y0 + obj.y1) / 2 >= cut[1]
|
||||
):
|
||||
cut_text.append((cut[0], c, obj))
|
||||
break
|
||||
else:
|
||||
|
|
@ -511,15 +563,24 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=''):
|
|||
grouped_chars = []
|
||||
for key, chars in groupby(cut_text, itemgetter(0, 1)):
|
||||
if flag_size:
|
||||
grouped_chars.append((key[0], key[1],
|
||||
flag_font_size([t[2] for t in chars], direction, strip_text=strip_text)))
|
||||
grouped_chars.append(
|
||||
(
|
||||
key[0],
|
||||
key[1],
|
||||
flag_font_size(
|
||||
[t[2] for t in chars], direction, strip_text=strip_text
|
||||
),
|
||||
)
|
||||
)
|
||||
else:
|
||||
gchars = [t[2].get_text() for t in chars]
|
||||
grouped_chars.append((key[0], key[1], ''.join(gchars).strip(strip_text)))
|
||||
grouped_chars.append((key[0], key[1], "".join(gchars).strip(strip_text)))
|
||||
return grouped_chars
|
||||
|
||||
|
||||
def get_table_index(table, t, direction, split_text=False, flag_size=False, strip_text='',):
|
||||
def get_table_index(
|
||||
table, t, direction, split_text=False, flag_size=False, strip_text=""
|
||||
):
|
||||
"""Gets indices of the table cell where given text object lies by
|
||||
comparing their y and x-coordinates.
|
||||
|
||||
|
|
@ -558,8 +619,9 @@ def get_table_index(table, t, direction, split_text=False, flag_size=False, stri
|
|||
"""
|
||||
r_idx, c_idx = [-1] * 2
|
||||
for r in range(len(table.rows)):
|
||||
if ((t.y0 + t.y1) / 2.0 < table.rows[r][0] and
|
||||
(t.y0 + t.y1) / 2.0 > table.rows[r][1]):
|
||||
if (t.y0 + t.y1) / 2.0 < table.rows[r][0] and (t.y0 + t.y1) / 2.0 > table.rows[
|
||||
r
|
||||
][1]:
|
||||
lt_col_overlap = []
|
||||
for c in table.cols:
|
||||
if c[0] <= t.x1 and c[1] >= t.x0:
|
||||
|
|
@ -569,11 +631,14 @@ def get_table_index(table, t, direction, split_text=False, flag_size=False, stri
|
|||
else:
|
||||
lt_col_overlap.append(-1)
|
||||
if len(list(filter(lambda x: x != -1, lt_col_overlap))) == 0:
|
||||
text = t.get_text().strip('\n')
|
||||
text = t.get_text().strip("\n")
|
||||
text_range = (t.x0, t.x1)
|
||||
col_range = (table.cols[0][0], table.cols[-1][1])
|
||||
warnings.warn("{} {} does not lie in column range {}".format(
|
||||
text, text_range, col_range))
|
||||
warnings.warn(
|
||||
"{} {} does not lie in column range {}".format(
|
||||
text, text_range, col_range
|
||||
)
|
||||
)
|
||||
r_idx = r
|
||||
c_idx = lt_col_overlap.index(max(lt_col_overlap))
|
||||
break
|
||||
|
|
@ -594,10 +659,24 @@ def get_table_index(table, t, direction, split_text=False, flag_size=False, stri
|
|||
error = ((X * (y0_offset + y1_offset)) + (Y * (x0_offset + x1_offset))) / charea
|
||||
|
||||
if split_text:
|
||||
return split_textline(table, t, direction, flag_size=flag_size, strip_text=strip_text), error
|
||||
return (
|
||||
split_textline(
|
||||
table, t, direction, flag_size=flag_size, strip_text=strip_text
|
||||
),
|
||||
error,
|
||||
)
|
||||
else:
|
||||
if flag_size:
|
||||
return [(r_idx, c_idx, flag_font_size(t._objs, direction, strip_text=strip_text))], error
|
||||
return (
|
||||
[
|
||||
(
|
||||
r_idx,
|
||||
c_idx,
|
||||
flag_font_size(t._objs, direction, strip_text=strip_text),
|
||||
)
|
||||
],
|
||||
error,
|
||||
)
|
||||
else:
|
||||
return [(r_idx, c_idx, t.get_text().strip(strip_text))], error
|
||||
|
||||
|
|
@ -650,14 +729,20 @@ def compute_whitespace(d):
|
|||
r_nempty_cells, c_nempty_cells = [], []
|
||||
for i in d:
|
||||
for j in i:
|
||||
if j.strip() == '':
|
||||
if j.strip() == "":
|
||||
whitespace += 1
|
||||
whitespace = 100 * (whitespace / float(len(d) * len(d[0])))
|
||||
return whitespace
|
||||
|
||||
|
||||
def get_page_layout(filename, char_margin=1.0, line_margin=0.5, word_margin=0.1,
|
||||
detect_vertical=True, all_texts=True):
|
||||
def get_page_layout(
|
||||
filename,
|
||||
char_margin=1.0,
|
||||
line_margin=0.5,
|
||||
word_margin=0.1,
|
||||
detect_vertical=True,
|
||||
all_texts=True,
|
||||
):
|
||||
"""Returns a PDFMiner LTPage object and page dimension of a single
|
||||
page pdf. See https://euske.github.io/pdfminer/ to get definitions
|
||||
of kwargs.
|
||||
|
|
@ -680,16 +765,18 @@ def get_page_layout(filename, char_margin=1.0, line_margin=0.5, word_margin=0.1,
|
|||
Dimension of pdf page in the form (width, height).
|
||||
|
||||
"""
|
||||
with open(filename, 'rb') as f:
|
||||
with open(filename, "rb") as f:
|
||||
parser = PDFParser(f)
|
||||
document = PDFDocument(parser)
|
||||
if not document.is_extractable:
|
||||
raise PDFTextExtractionNotAllowed
|
||||
laparams = LAParams(char_margin=char_margin,
|
||||
line_margin=line_margin,
|
||||
word_margin=word_margin,
|
||||
detect_vertical=detect_vertical,
|
||||
all_texts=all_texts)
|
||||
laparams = LAParams(
|
||||
char_margin=char_margin,
|
||||
line_margin=line_margin,
|
||||
word_margin=word_margin,
|
||||
detect_vertical=detect_vertical,
|
||||
all_texts=all_texts,
|
||||
)
|
||||
rsrcmgr = PDFResourceManager()
|
||||
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
|
||||
interpreter = PDFPageInterpreter(rsrcmgr, device)
|
||||
|
|
@ -721,13 +808,13 @@ def get_text_objects(layout, ltype="char", t=None):
|
|||
List of PDFMiner text objects.
|
||||
|
||||
"""
|
||||
if ltype == 'char':
|
||||
if ltype == "char":
|
||||
LTObject = LTChar
|
||||
elif ltype == 'image':
|
||||
elif ltype == "image":
|
||||
LTObject = LTImage
|
||||
elif ltype == 'horizontal_text':
|
||||
elif ltype == "horizontal_text":
|
||||
LTObject = LTTextLineHorizontal
|
||||
elif ltype == 'vertical_text':
|
||||
elif ltype == "vertical_text":
|
||||
LTObject = LTTextLineVertical
|
||||
if t is None:
|
||||
t = []
|
||||
|
|
|
|||
Loading…
Reference in New Issue