Blacken code
parent
27d55d056c
commit
2115a0e177
|
|
@ -9,8 +9,8 @@ from .io import read_pdf
|
||||||
from .plotting import PlotMethods
|
from .plotting import PlotMethods
|
||||||
|
|
||||||
|
|
||||||
def _write_usage(self, prog, args='', prefix='Usage: '):
|
def _write_usage(self, prog, args="", prefix="Usage: "):
|
||||||
return self._write_usage('camelot', args, prefix=prefix)
|
return self._write_usage("camelot", args, prefix=prefix)
|
||||||
|
|
||||||
|
|
||||||
# monkey patch click.HelpFormatter
|
# monkey patch click.HelpFormatter
|
||||||
|
|
@ -18,10 +18,10 @@ HelpFormatter._write_usage = HelpFormatter.write_usage
|
||||||
HelpFormatter.write_usage = _write_usage
|
HelpFormatter.write_usage = _write_usage
|
||||||
|
|
||||||
# set up logging
|
# set up logging
|
||||||
logger = logging.getLogger('camelot')
|
logger = logging.getLogger("camelot")
|
||||||
|
|
||||||
format_string = '%(asctime)s - %(levelname)s - %(message)s'
|
format_string = "%(asctime)s - %(levelname)s - %(message)s"
|
||||||
formatter = logging.Formatter(format_string, datefmt='%Y-%m-%dT%H:%M:%S')
|
formatter = logging.Formatter(format_string, datefmt="%Y-%m-%dT%H:%M:%S")
|
||||||
handler = logging.StreamHandler()
|
handler = logging.StreamHandler()
|
||||||
handler.setFormatter(formatter)
|
handler.setFormatter(formatter)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -3,7 +3,7 @@
|
||||||
from __future__ import absolute_import
|
from __future__ import absolute_import
|
||||||
|
|
||||||
|
|
||||||
__all__ = ('main',)
|
__all__ = ("main",)
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
|
|
||||||
|
|
@ -1,23 +1,23 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
VERSION = (0, 7, 2)
|
VERSION = (0, 7, 2)
|
||||||
PRERELEASE = None # alpha, beta or rc
|
PRERELEASE = None # alpha, beta or rc
|
||||||
REVISION = None
|
REVISION = None
|
||||||
|
|
||||||
|
|
||||||
def generate_version(version, prerelease=None, revision=None):
|
def generate_version(version, prerelease=None, revision=None):
|
||||||
version_parts = ['.'.join(map(str, version))]
|
version_parts = [".".join(map(str, version))]
|
||||||
if prerelease is not None:
|
if prerelease is not None:
|
||||||
version_parts.append('-{}'.format(prerelease))
|
version_parts.append("-{}".format(prerelease))
|
||||||
if revision is not None:
|
if revision is not None:
|
||||||
version_parts.append('.{}'.format(revision))
|
version_parts.append(".{}".format(revision))
|
||||||
return ''.join(version_parts)
|
return "".join(version_parts)
|
||||||
|
|
||||||
|
|
||||||
__title__ = 'camelot-py'
|
__title__ = "camelot-py"
|
||||||
__description__ = 'PDF Table Extraction for Humans.'
|
__description__ = "PDF Table Extraction for Humans."
|
||||||
__url__ = 'http://camelot-py.readthedocs.io/'
|
__url__ = "http://camelot-py.readthedocs.io/"
|
||||||
__version__ = generate_version(VERSION, prerelease=PRERELEASE, revision=REVISION)
|
__version__ = generate_version(VERSION, prerelease=PRERELEASE, revision=REVISION)
|
||||||
__author__ = 'Vinayak Mehta'
|
__author__ = "Vinayak Mehta"
|
||||||
__author_email__ = 'vmehta94@gmail.com'
|
__author_email__ = "vmehta94@gmail.com"
|
||||||
__license__ = 'MIT License'
|
__license__ = "MIT License"
|
||||||
|
|
|
||||||
341
camelot/cli.py
341
camelot/cli.py
|
|
@ -3,6 +3,7 @@
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
import click
|
import click
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
except ImportError:
|
except ImportError:
|
||||||
|
|
@ -13,7 +14,7 @@ else:
|
||||||
from . import __version__, read_pdf, plot
|
from . import __version__, read_pdf, plot
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger('camelot')
|
logger = logging.getLogger("camelot")
|
||||||
logger.setLevel(logging.INFO)
|
logger.setLevel(logging.INFO)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -30,23 +31,47 @@ pass_config = click.make_pass_decorator(Config)
|
||||||
|
|
||||||
@click.group()
|
@click.group()
|
||||||
@click.version_option(version=__version__)
|
@click.version_option(version=__version__)
|
||||||
@click.option('-q', '--quiet', is_flag=False, help='Suppress logs and warnings.')
|
@click.option("-q", "--quiet", is_flag=False, help="Suppress logs and warnings.")
|
||||||
@click.option('-p', '--pages', default='1', help='Comma-separated page numbers.'
|
@click.option(
|
||||||
' Example: 1,3,4 or 1,4-end or all.')
|
"-p",
|
||||||
@click.option('-pw', '--password', help='Password for decryption.')
|
"--pages",
|
||||||
@click.option('-o', '--output', help='Output file path.')
|
default="1",
|
||||||
@click.option('-f', '--format',
|
help="Comma-separated page numbers." " Example: 1,3,4 or 1,4-end or all.",
|
||||||
type=click.Choice(['csv', 'json', 'excel', 'html', 'sqlite']),
|
)
|
||||||
help='Output file format.')
|
@click.option("-pw", "--password", help="Password for decryption.")
|
||||||
@click.option('-z', '--zip', is_flag=True, help='Create ZIP archive.')
|
@click.option("-o", "--output", help="Output file path.")
|
||||||
@click.option('-split', '--split_text', is_flag=True,
|
@click.option(
|
||||||
help='Split text that spans across multiple cells.')
|
"-f",
|
||||||
@click.option('-flag', '--flag_size', is_flag=True, help='Flag text based on'
|
"--format",
|
||||||
' font size. Useful to detect super/subscripts.')
|
type=click.Choice(["csv", "json", "excel", "html", "sqlite"]),
|
||||||
@click.option('-strip', '--strip_text', help='Characters that should be stripped from a string before'
|
help="Output file format.",
|
||||||
' assigning it to a cell.')
|
)
|
||||||
@click.option('-M', '--margins', nargs=3, default=(1.0, 0.5, 0.1),
|
@click.option("-z", "--zip", is_flag=True, help="Create ZIP archive.")
|
||||||
help='PDFMiner char_margin, line_margin and word_margin.')
|
@click.option(
|
||||||
|
"-split",
|
||||||
|
"--split_text",
|
||||||
|
is_flag=True,
|
||||||
|
help="Split text that spans across multiple cells.",
|
||||||
|
)
|
||||||
|
@click.option(
|
||||||
|
"-flag",
|
||||||
|
"--flag_size",
|
||||||
|
is_flag=True,
|
||||||
|
help="Flag text based on" " font size. Useful to detect super/subscripts.",
|
||||||
|
)
|
||||||
|
@click.option(
|
||||||
|
"-strip",
|
||||||
|
"--strip_text",
|
||||||
|
help="Characters that should be stripped from a string before"
|
||||||
|
" assigning it to a cell.",
|
||||||
|
)
|
||||||
|
@click.option(
|
||||||
|
"-M",
|
||||||
|
"--margins",
|
||||||
|
nargs=3,
|
||||||
|
default=(1.0, 0.5, 0.1),
|
||||||
|
help="PDFMiner char_margin, line_margin and word_margin.",
|
||||||
|
)
|
||||||
@click.pass_context
|
@click.pass_context
|
||||||
def cli(ctx, *args, **kwargs):
|
def cli(ctx, *args, **kwargs):
|
||||||
"""Camelot: PDF Table Extraction for Humans"""
|
"""Camelot: PDF Table Extraction for Humans"""
|
||||||
|
|
@ -55,79 +80,131 @@ def cli(ctx, *args, **kwargs):
|
||||||
ctx.obj.set_config(key, value)
|
ctx.obj.set_config(key, value)
|
||||||
|
|
||||||
|
|
||||||
@cli.command('lattice')
|
@cli.command("lattice")
|
||||||
@click.option('-R', '--table_regions', default=[], multiple=True,
|
@click.option(
|
||||||
help='Page regions to analyze. Example: x1,y1,x2,y2'
|
"-R",
|
||||||
' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
|
"--table_regions",
|
||||||
@click.option('-T', '--table_areas', default=[], multiple=True,
|
default=[],
|
||||||
help='Table areas to process. Example: x1,y1,x2,y2'
|
multiple=True,
|
||||||
' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
|
help="Page regions to analyze. Example: x1,y1,x2,y2"
|
||||||
@click.option('-back', '--process_background', is_flag=True,
|
" where x1, y1 -> left-top and x2, y2 -> right-bottom.",
|
||||||
help='Process background lines.')
|
)
|
||||||
@click.option('-scale', '--line_scale', default=15,
|
@click.option(
|
||||||
help='Line size scaling factor. The larger the value,'
|
"-T",
|
||||||
' the smaller the detected lines.')
|
"--table_areas",
|
||||||
@click.option('-copy', '--copy_text', default=[], type=click.Choice(['h', 'v']),
|
default=[],
|
||||||
multiple=True, help='Direction in which text in a spanning cell'
|
multiple=True,
|
||||||
' will be copied over.')
|
help="Table areas to process. Example: x1,y1,x2,y2"
|
||||||
@click.option('-shift', '--shift_text', default=['l', 't'],
|
" where x1, y1 -> left-top and x2, y2 -> right-bottom.",
|
||||||
type=click.Choice(['', 'l', 'r', 't', 'b']), multiple=True,
|
)
|
||||||
help='Direction in which text in a spanning cell will flow.')
|
@click.option(
|
||||||
@click.option('-l', '--line_tol', default=2,
|
"-back", "--process_background", is_flag=True, help="Process background lines."
|
||||||
help='Tolerance parameter used to merge close vertical'
|
)
|
||||||
' and horizontal lines.')
|
@click.option(
|
||||||
@click.option('-j', '--joint_tol', default=2,
|
"-scale",
|
||||||
help='Tolerance parameter used to decide whether'
|
"--line_scale",
|
||||||
' the detected lines and points lie close to each other.')
|
default=15,
|
||||||
@click.option('-block', '--threshold_blocksize', default=15,
|
help="Line size scaling factor. The larger the value,"
|
||||||
help='For adaptive thresholding, size of a pixel'
|
" the smaller the detected lines.",
|
||||||
' neighborhood that is used to calculate a threshold value for'
|
)
|
||||||
' the pixel. Example: 3, 5, 7, and so on.')
|
@click.option(
|
||||||
@click.option('-const', '--threshold_constant', default=-2,
|
"-copy",
|
||||||
help='For adaptive thresholding, constant subtracted'
|
"--copy_text",
|
||||||
' from the mean or weighted mean. Normally, it is positive but'
|
default=[],
|
||||||
' may be zero or negative as well.')
|
type=click.Choice(["h", "v"]),
|
||||||
@click.option('-I', '--iterations', default=0,
|
multiple=True,
|
||||||
help='Number of times for erosion/dilation will be applied.')
|
help="Direction in which text in a spanning cell" " will be copied over.",
|
||||||
@click.option('-res', '--resolution', default=300,
|
)
|
||||||
help='Resolution used for PDF to PNG conversion.')
|
@click.option(
|
||||||
@click.option('-plot', '--plot_type',
|
"-shift",
|
||||||
type=click.Choice(['text', 'grid', 'contour', 'joint', 'line']),
|
"--shift_text",
|
||||||
help='Plot elements found on PDF page for visual debugging.')
|
default=["l", "t"],
|
||||||
@click.argument('filepath', type=click.Path(exists=True))
|
type=click.Choice(["", "l", "r", "t", "b"]),
|
||||||
|
multiple=True,
|
||||||
|
help="Direction in which text in a spanning cell will flow.",
|
||||||
|
)
|
||||||
|
@click.option(
|
||||||
|
"-l",
|
||||||
|
"--line_tol",
|
||||||
|
default=2,
|
||||||
|
help="Tolerance parameter used to merge close vertical" " and horizontal lines.",
|
||||||
|
)
|
||||||
|
@click.option(
|
||||||
|
"-j",
|
||||||
|
"--joint_tol",
|
||||||
|
default=2,
|
||||||
|
help="Tolerance parameter used to decide whether"
|
||||||
|
" the detected lines and points lie close to each other.",
|
||||||
|
)
|
||||||
|
@click.option(
|
||||||
|
"-block",
|
||||||
|
"--threshold_blocksize",
|
||||||
|
default=15,
|
||||||
|
help="For adaptive thresholding, size of a pixel"
|
||||||
|
" neighborhood that is used to calculate a threshold value for"
|
||||||
|
" the pixel. Example: 3, 5, 7, and so on.",
|
||||||
|
)
|
||||||
|
@click.option(
|
||||||
|
"-const",
|
||||||
|
"--threshold_constant",
|
||||||
|
default=-2,
|
||||||
|
help="For adaptive thresholding, constant subtracted"
|
||||||
|
" from the mean or weighted mean. Normally, it is positive but"
|
||||||
|
" may be zero or negative as well.",
|
||||||
|
)
|
||||||
|
@click.option(
|
||||||
|
"-I",
|
||||||
|
"--iterations",
|
||||||
|
default=0,
|
||||||
|
help="Number of times for erosion/dilation will be applied.",
|
||||||
|
)
|
||||||
|
@click.option(
|
||||||
|
"-res",
|
||||||
|
"--resolution",
|
||||||
|
default=300,
|
||||||
|
help="Resolution used for PDF to PNG conversion.",
|
||||||
|
)
|
||||||
|
@click.option(
|
||||||
|
"-plot",
|
||||||
|
"--plot_type",
|
||||||
|
type=click.Choice(["text", "grid", "contour", "joint", "line"]),
|
||||||
|
help="Plot elements found on PDF page for visual debugging.",
|
||||||
|
)
|
||||||
|
@click.argument("filepath", type=click.Path(exists=True))
|
||||||
@pass_config
|
@pass_config
|
||||||
def lattice(c, *args, **kwargs):
|
def lattice(c, *args, **kwargs):
|
||||||
"""Use lines between text to parse the table."""
|
"""Use lines between text to parse the table."""
|
||||||
conf = c.config
|
conf = c.config
|
||||||
pages = conf.pop('pages')
|
pages = conf.pop("pages")
|
||||||
output = conf.pop('output')
|
output = conf.pop("output")
|
||||||
f = conf.pop('format')
|
f = conf.pop("format")
|
||||||
compress = conf.pop('zip')
|
compress = conf.pop("zip")
|
||||||
quiet = conf.pop('quiet')
|
quiet = conf.pop("quiet")
|
||||||
plot_type = kwargs.pop('plot_type')
|
plot_type = kwargs.pop("plot_type")
|
||||||
filepath = kwargs.pop('filepath')
|
filepath = kwargs.pop("filepath")
|
||||||
kwargs.update(conf)
|
kwargs.update(conf)
|
||||||
|
|
||||||
table_regions = list(kwargs['table_regions'])
|
table_regions = list(kwargs["table_regions"])
|
||||||
kwargs['table_regions'] = None if not table_regions else table_regions
|
kwargs["table_regions"] = None if not table_regions else table_regions
|
||||||
table_areas = list(kwargs['table_areas'])
|
table_areas = list(kwargs["table_areas"])
|
||||||
kwargs['table_areas'] = None if not table_areas else table_areas
|
kwargs["table_areas"] = None if not table_areas else table_areas
|
||||||
copy_text = list(kwargs['copy_text'])
|
copy_text = list(kwargs["copy_text"])
|
||||||
kwargs['copy_text'] = None if not copy_text else copy_text
|
kwargs["copy_text"] = None if not copy_text else copy_text
|
||||||
kwargs['shift_text'] = list(kwargs['shift_text'])
|
kwargs["shift_text"] = list(kwargs["shift_text"])
|
||||||
|
|
||||||
if plot_type is not None:
|
if plot_type is not None:
|
||||||
if not _HAS_MPL:
|
if not _HAS_MPL:
|
||||||
raise ImportError('matplotlib is required for plotting.')
|
raise ImportError("matplotlib is required for plotting.")
|
||||||
else:
|
else:
|
||||||
if output is None:
|
if output is None:
|
||||||
raise click.UsageError('Please specify output file path using --output')
|
raise click.UsageError("Please specify output file path using --output")
|
||||||
if f is None:
|
if f is None:
|
||||||
raise click.UsageError('Please specify output file format using --format')
|
raise click.UsageError("Please specify output file format using --format")
|
||||||
|
|
||||||
tables = read_pdf(filepath, pages=pages, flavor='lattice',
|
tables = read_pdf(
|
||||||
suppress_stdout=quiet, **kwargs)
|
filepath, pages=pages, flavor="lattice", suppress_stdout=quiet, **kwargs
|
||||||
click.echo('Found {} tables'.format(tables.n))
|
)
|
||||||
|
click.echo("Found {} tables".format(tables.n))
|
||||||
if plot_type is not None:
|
if plot_type is not None:
|
||||||
for table in tables:
|
for table in tables:
|
||||||
plot(table, kind=plot_type)
|
plot(table, kind=plot_type)
|
||||||
|
|
@ -136,57 +213,89 @@ def lattice(c, *args, **kwargs):
|
||||||
tables.export(output, f=f, compress=compress)
|
tables.export(output, f=f, compress=compress)
|
||||||
|
|
||||||
|
|
||||||
@cli.command('stream')
|
@cli.command("stream")
|
||||||
@click.option('-R', '--table_regions', default=[], multiple=True,
|
@click.option(
|
||||||
help='Page regions to analyze. Example: x1,y1,x2,y2'
|
"-R",
|
||||||
' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
|
"--table_regions",
|
||||||
@click.option('-T', '--table_areas', default=[], multiple=True,
|
default=[],
|
||||||
help='Table areas to process. Example: x1,y1,x2,y2'
|
multiple=True,
|
||||||
' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
|
help="Page regions to analyze. Example: x1,y1,x2,y2"
|
||||||
@click.option('-C', '--columns', default=[], multiple=True,
|
" where x1, y1 -> left-top and x2, y2 -> right-bottom.",
|
||||||
help='X coordinates of column separators.')
|
)
|
||||||
@click.option('-e', '--edge_tol', default=50, help='Tolerance parameter'
|
@click.option(
|
||||||
' for extending textedges vertically.')
|
"-T",
|
||||||
@click.option('-r', '--row_tol', default=2, help='Tolerance parameter'
|
"--table_areas",
|
||||||
' used to combine text vertically, to generate rows.')
|
default=[],
|
||||||
@click.option('-c', '--column_tol', default=0, help='Tolerance parameter'
|
multiple=True,
|
||||||
' used to combine text horizontally, to generate columns.')
|
help="Table areas to process. Example: x1,y1,x2,y2"
|
||||||
@click.option('-plot', '--plot_type',
|
" where x1, y1 -> left-top and x2, y2 -> right-bottom.",
|
||||||
type=click.Choice(['text', 'grid', 'contour', 'textedge']),
|
)
|
||||||
help='Plot elements found on PDF page for visual debugging.')
|
@click.option(
|
||||||
@click.argument('filepath', type=click.Path(exists=True))
|
"-C",
|
||||||
|
"--columns",
|
||||||
|
default=[],
|
||||||
|
multiple=True,
|
||||||
|
help="X coordinates of column separators.",
|
||||||
|
)
|
||||||
|
@click.option(
|
||||||
|
"-e",
|
||||||
|
"--edge_tol",
|
||||||
|
default=50,
|
||||||
|
help="Tolerance parameter" " for extending textedges vertically.",
|
||||||
|
)
|
||||||
|
@click.option(
|
||||||
|
"-r",
|
||||||
|
"--row_tol",
|
||||||
|
default=2,
|
||||||
|
help="Tolerance parameter" " used to combine text vertically, to generate rows.",
|
||||||
|
)
|
||||||
|
@click.option(
|
||||||
|
"-c",
|
||||||
|
"--column_tol",
|
||||||
|
default=0,
|
||||||
|
help="Tolerance parameter"
|
||||||
|
" used to combine text horizontally, to generate columns.",
|
||||||
|
)
|
||||||
|
@click.option(
|
||||||
|
"-plot",
|
||||||
|
"--plot_type",
|
||||||
|
type=click.Choice(["text", "grid", "contour", "textedge"]),
|
||||||
|
help="Plot elements found on PDF page for visual debugging.",
|
||||||
|
)
|
||||||
|
@click.argument("filepath", type=click.Path(exists=True))
|
||||||
@pass_config
|
@pass_config
|
||||||
def stream(c, *args, **kwargs):
|
def stream(c, *args, **kwargs):
|
||||||
"""Use spaces between text to parse the table."""
|
"""Use spaces between text to parse the table."""
|
||||||
conf = c.config
|
conf = c.config
|
||||||
pages = conf.pop('pages')
|
pages = conf.pop("pages")
|
||||||
output = conf.pop('output')
|
output = conf.pop("output")
|
||||||
f = conf.pop('format')
|
f = conf.pop("format")
|
||||||
compress = conf.pop('zip')
|
compress = conf.pop("zip")
|
||||||
quiet = conf.pop('quiet')
|
quiet = conf.pop("quiet")
|
||||||
plot_type = kwargs.pop('plot_type')
|
plot_type = kwargs.pop("plot_type")
|
||||||
filepath = kwargs.pop('filepath')
|
filepath = kwargs.pop("filepath")
|
||||||
kwargs.update(conf)
|
kwargs.update(conf)
|
||||||
|
|
||||||
table_regions = list(kwargs['table_regions'])
|
table_regions = list(kwargs["table_regions"])
|
||||||
kwargs['table_regions'] = None if not table_regions else table_regions
|
kwargs["table_regions"] = None if not table_regions else table_regions
|
||||||
table_areas = list(kwargs['table_areas'])
|
table_areas = list(kwargs["table_areas"])
|
||||||
kwargs['table_areas'] = None if not table_areas else table_areas
|
kwargs["table_areas"] = None if not table_areas else table_areas
|
||||||
columns = list(kwargs['columns'])
|
columns = list(kwargs["columns"])
|
||||||
kwargs['columns'] = None if not columns else columns
|
kwargs["columns"] = None if not columns else columns
|
||||||
|
|
||||||
if plot_type is not None:
|
if plot_type is not None:
|
||||||
if not _HAS_MPL:
|
if not _HAS_MPL:
|
||||||
raise ImportError('matplotlib is required for plotting.')
|
raise ImportError("matplotlib is required for plotting.")
|
||||||
else:
|
else:
|
||||||
if output is None:
|
if output is None:
|
||||||
raise click.UsageError('Please specify output file path using --output')
|
raise click.UsageError("Please specify output file path using --output")
|
||||||
if f is None:
|
if f is None:
|
||||||
raise click.UsageError('Please specify output file format using --format')
|
raise click.UsageError("Please specify output file format using --format")
|
||||||
|
|
||||||
tables = read_pdf(filepath, pages=pages, flavor='stream',
|
tables = read_pdf(
|
||||||
suppress_stdout=quiet, **kwargs)
|
filepath, pages=pages, flavor="stream", suppress_stdout=quiet, **kwargs
|
||||||
click.echo('Found {} tables'.format(tables.n))
|
)
|
||||||
|
click.echo("Found {} tables".format(tables.n))
|
||||||
if plot_type is not None:
|
if plot_type is not None:
|
||||||
for table in tables:
|
for table in tables:
|
||||||
plot(table, kind=plot_type)
|
plot(table, kind=plot_type)
|
||||||
|
|
|
||||||
201
camelot/core.py
201
camelot/core.py
|
|
@ -42,7 +42,8 @@ class TextEdge(object):
|
||||||
TEXTEDGE_REQUIRED_ELEMENTS horizontal text rows.
|
TEXTEDGE_REQUIRED_ELEMENTS horizontal text rows.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
def __init__(self, x, y0, y1, align='left'):
|
|
||||||
|
def __init__(self, x, y0, y1, align="left"):
|
||||||
self.x = x
|
self.x = x
|
||||||
self.y0 = y0
|
self.y0 = y0
|
||||||
self.y1 = y1
|
self.y1 = y1
|
||||||
|
|
@ -51,8 +52,13 @@ class TextEdge(object):
|
||||||
self.is_valid = False
|
self.is_valid = False
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return '<TextEdge x={} y0={} y1={} align={} valid={}>'.format(
|
return "<TextEdge x={} y0={} y1={} align={} valid={}>".format(
|
||||||
round(self.x, 2), round(self.y0, 2), round(self.y1, 2), self.align, self.is_valid)
|
round(self.x, 2),
|
||||||
|
round(self.y0, 2),
|
||||||
|
round(self.y1, 2),
|
||||||
|
self.align,
|
||||||
|
self.is_valid,
|
||||||
|
)
|
||||||
|
|
||||||
def update_coords(self, x, y0, edge_tol=50):
|
def update_coords(self, x, y0, edge_tol=50):
|
||||||
"""Updates the text edge's x and bottom y coordinates and sets
|
"""Updates the text edge's x and bottom y coordinates and sets
|
||||||
|
|
@ -73,9 +79,10 @@ class TextEdges(object):
|
||||||
the PDF page. The dict has three keys based on the alignments,
|
the PDF page. The dict has three keys based on the alignments,
|
||||||
and each key's value is a list of camelot.core.TextEdge objects.
|
and each key's value is a list of camelot.core.TextEdge objects.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, edge_tol=50):
|
def __init__(self, edge_tol=50):
|
||||||
self.edge_tol = edge_tol
|
self.edge_tol = edge_tol
|
||||||
self._textedges = {'left': [], 'right': [], 'middle': []}
|
self._textedges = {"left": [], "right": [], "middle": []}
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_x_coord(textline, align):
|
def get_x_coord(textline, align):
|
||||||
|
|
@ -85,7 +92,7 @@ class TextEdges(object):
|
||||||
x_left = textline.x0
|
x_left = textline.x0
|
||||||
x_right = textline.x1
|
x_right = textline.x1
|
||||||
x_middle = x_left + (x_right - x_left) / 2.0
|
x_middle = x_left + (x_right - x_left) / 2.0
|
||||||
x_coord = {'left': x_left, 'middle': x_middle, 'right': x_right}
|
x_coord = {"left": x_left, "middle": x_middle, "right": x_right}
|
||||||
return x_coord[align]
|
return x_coord[align]
|
||||||
|
|
||||||
def find(self, x_coord, align):
|
def find(self, x_coord, align):
|
||||||
|
|
@ -109,21 +116,22 @@ class TextEdges(object):
|
||||||
def update(self, textline):
|
def update(self, textline):
|
||||||
"""Updates an existing text edge in the current dict.
|
"""Updates an existing text edge in the current dict.
|
||||||
"""
|
"""
|
||||||
for align in ['left', 'right', 'middle']:
|
for align in ["left", "right", "middle"]:
|
||||||
x_coord = self.get_x_coord(textline, align)
|
x_coord = self.get_x_coord(textline, align)
|
||||||
idx = self.find(x_coord, align)
|
idx = self.find(x_coord, align)
|
||||||
if idx is None:
|
if idx is None:
|
||||||
self.add(textline, align)
|
self.add(textline, align)
|
||||||
else:
|
else:
|
||||||
self._textedges[align][idx].update_coords(
|
self._textedges[align][idx].update_coords(
|
||||||
x_coord, textline.y0, edge_tol=self.edge_tol)
|
x_coord, textline.y0, edge_tol=self.edge_tol
|
||||||
|
)
|
||||||
|
|
||||||
def generate(self, textlines):
|
def generate(self, textlines):
|
||||||
"""Generates the text edges dict based on horizontal text
|
"""Generates the text edges dict based on horizontal text
|
||||||
rows.
|
rows.
|
||||||
"""
|
"""
|
||||||
for tl in textlines:
|
for tl in textlines:
|
||||||
if len(tl.get_text().strip()) > 1: # TODO: hacky
|
if len(tl.get_text().strip()) > 1: # TODO: hacky
|
||||||
self.update(tl)
|
self.update(tl)
|
||||||
|
|
||||||
def get_relevant(self):
|
def get_relevant(self):
|
||||||
|
|
@ -132,9 +140,15 @@ class TextEdges(object):
|
||||||
the most.
|
the most.
|
||||||
"""
|
"""
|
||||||
intersections_sum = {
|
intersections_sum = {
|
||||||
'left': sum(te.intersections for te in self._textedges['left'] if te.is_valid),
|
"left": sum(
|
||||||
'right': sum(te.intersections for te in self._textedges['right'] if te.is_valid),
|
te.intersections for te in self._textedges["left"] if te.is_valid
|
||||||
'middle': sum(te.intersections for te in self._textedges['middle'] if te.is_valid)
|
),
|
||||||
|
"right": sum(
|
||||||
|
te.intersections for te in self._textedges["right"] if te.is_valid
|
||||||
|
),
|
||||||
|
"middle": sum(
|
||||||
|
te.intersections for te in self._textedges["middle"] if te.is_valid
|
||||||
|
),
|
||||||
}
|
}
|
||||||
|
|
||||||
# TODO: naive
|
# TODO: naive
|
||||||
|
|
@ -147,6 +161,7 @@ class TextEdges(object):
|
||||||
"""Returns a dict of interesting table areas on the PDF page
|
"""Returns a dict of interesting table areas on the PDF page
|
||||||
calculated using relevant text edges.
|
calculated using relevant text edges.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def pad(area, average_row_height):
|
def pad(area, average_row_height):
|
||||||
x0 = area[0] - TABLE_AREA_PADDING
|
x0 = area[0] - TABLE_AREA_PADDING
|
||||||
y0 = area[1] - TABLE_AREA_PADDING
|
y0 = area[1] - TABLE_AREA_PADDING
|
||||||
|
|
@ -175,7 +190,11 @@ class TextEdges(object):
|
||||||
else:
|
else:
|
||||||
table_areas.pop(found)
|
table_areas.pop(found)
|
||||||
updated_area = (
|
updated_area = (
|
||||||
found[0], min(te.y0, found[1]), max(found[2], te.x), max(found[3], te.y1))
|
found[0],
|
||||||
|
min(te.y0, found[1]),
|
||||||
|
max(found[2], te.x),
|
||||||
|
max(found[3], te.y1),
|
||||||
|
)
|
||||||
table_areas[updated_area] = None
|
table_areas[updated_area] = None
|
||||||
|
|
||||||
# extend table areas based on textlines that overlap
|
# extend table areas based on textlines that overlap
|
||||||
|
|
@ -196,7 +215,11 @@ class TextEdges(object):
|
||||||
if found is not None:
|
if found is not None:
|
||||||
table_areas.pop(found)
|
table_areas.pop(found)
|
||||||
updated_area = (
|
updated_area = (
|
||||||
min(tl.x0, found[0]), min(tl.y0, found[1]), max(found[2], tl.x1), max(found[3], tl.y1))
|
min(tl.x0, found[0]),
|
||||||
|
min(tl.y0, found[1]),
|
||||||
|
max(found[2], tl.x1),
|
||||||
|
max(found[3], tl.y1),
|
||||||
|
)
|
||||||
table_areas[updated_area] = None
|
table_areas[updated_area] = None
|
||||||
average_textline_height = sum_textline_height / float(len(textlines))
|
average_textline_height = sum_textline_height / float(len(textlines))
|
||||||
|
|
||||||
|
|
@ -265,11 +288,12 @@ class Cell(object):
|
||||||
self.bottom = False
|
self.bottom = False
|
||||||
self.hspan = False
|
self.hspan = False
|
||||||
self.vspan = False
|
self.vspan = False
|
||||||
self._text = ''
|
self._text = ""
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return '<Cell x1={} y1={} x2={} y2={}>'.format(
|
return "<Cell x1={} y1={} x2={} y2={}>".format(
|
||||||
round(self.x1, 2), round(self.y1, 2), round(self.x2, 2), round(self.y2, 2))
|
round(self.x1, 2), round(self.y1, 2), round(self.x2, 2), round(self.y2, 2)
|
||||||
|
)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def text(self):
|
def text(self):
|
||||||
|
|
@ -277,7 +301,7 @@ class Cell(object):
|
||||||
|
|
||||||
@text.setter
|
@text.setter
|
||||||
def text(self, t):
|
def text(self, t):
|
||||||
self._text = ''.join([self._text, t])
|
self._text = "".join([self._text, t])
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def bound(self):
|
def bound(self):
|
||||||
|
|
@ -314,11 +338,11 @@ class Table(object):
|
||||||
PDF page number.
|
PDF page number.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, cols, rows):
|
def __init__(self, cols, rows):
|
||||||
self.cols = cols
|
self.cols = cols
|
||||||
self.rows = rows
|
self.rows = rows
|
||||||
self.cells = [[Cell(c[0], r[1], c[1], r[0])
|
self.cells = [[Cell(c[0], r[1], c[1], r[0]) for c in cols] for r in rows]
|
||||||
for c in cols] for r in rows]
|
|
||||||
self.df = None
|
self.df = None
|
||||||
self.shape = (0, 0)
|
self.shape = (0, 0)
|
||||||
self.accuracy = 0
|
self.accuracy = 0
|
||||||
|
|
@ -327,7 +351,7 @@ class Table(object):
|
||||||
self.page = None
|
self.page = None
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return '<{} shape={}>'.format(self.__class__.__name__, self.shape)
|
return "<{} shape={}>".format(self.__class__.__name__, self.shape)
|
||||||
|
|
||||||
def __lt__(self, other):
|
def __lt__(self, other):
|
||||||
if self.page == other.page:
|
if self.page == other.page:
|
||||||
|
|
@ -352,10 +376,10 @@ class Table(object):
|
||||||
"""
|
"""
|
||||||
# pretty?
|
# pretty?
|
||||||
report = {
|
report = {
|
||||||
'accuracy': round(self.accuracy, 2),
|
"accuracy": round(self.accuracy, 2),
|
||||||
'whitespace': round(self.whitespace, 2),
|
"whitespace": round(self.whitespace, 2),
|
||||||
'order': self.order,
|
"order": self.order,
|
||||||
'page': self.page
|
"page": self.page,
|
||||||
}
|
}
|
||||||
return report
|
return report
|
||||||
|
|
||||||
|
|
@ -383,12 +407,21 @@ class Table(object):
|
||||||
for v in vertical:
|
for v in vertical:
|
||||||
# find closest x coord
|
# find closest x coord
|
||||||
# iterate over y coords and find closest start and end points
|
# iterate over y coords and find closest start and end points
|
||||||
i = [i for i, t in enumerate(self.cols)
|
i = [
|
||||||
if np.isclose(v[0], t[0], atol=joint_tol)]
|
i
|
||||||
j = [j for j, t in enumerate(self.rows)
|
for i, t in enumerate(self.cols)
|
||||||
if np.isclose(v[3], t[0], atol=joint_tol)]
|
if np.isclose(v[0], t[0], atol=joint_tol)
|
||||||
k = [k for k, t in enumerate(self.rows)
|
]
|
||||||
if np.isclose(v[1], t[0], atol=joint_tol)]
|
j = [
|
||||||
|
j
|
||||||
|
for j, t in enumerate(self.rows)
|
||||||
|
if np.isclose(v[3], t[0], atol=joint_tol)
|
||||||
|
]
|
||||||
|
k = [
|
||||||
|
k
|
||||||
|
for k, t in enumerate(self.rows)
|
||||||
|
if np.isclose(v[1], t[0], atol=joint_tol)
|
||||||
|
]
|
||||||
if not j:
|
if not j:
|
||||||
continue
|
continue
|
||||||
J = j[0]
|
J = j[0]
|
||||||
|
|
@ -434,12 +467,21 @@ class Table(object):
|
||||||
for h in horizontal:
|
for h in horizontal:
|
||||||
# find closest y coord
|
# find closest y coord
|
||||||
# iterate over x coords and find closest start and end points
|
# iterate over x coords and find closest start and end points
|
||||||
i = [i for i, t in enumerate(self.rows)
|
i = [
|
||||||
if np.isclose(h[1], t[0], atol=joint_tol)]
|
i
|
||||||
j = [j for j, t in enumerate(self.cols)
|
for i, t in enumerate(self.rows)
|
||||||
if np.isclose(h[0], t[0], atol=joint_tol)]
|
if np.isclose(h[1], t[0], atol=joint_tol)
|
||||||
k = [k for k, t in enumerate(self.cols)
|
]
|
||||||
if np.isclose(h[2], t[0], atol=joint_tol)]
|
j = [
|
||||||
|
j
|
||||||
|
for j, t in enumerate(self.cols)
|
||||||
|
if np.isclose(h[0], t[0], atol=joint_tol)
|
||||||
|
]
|
||||||
|
k = [
|
||||||
|
k
|
||||||
|
for k, t in enumerate(self.cols)
|
||||||
|
if np.isclose(h[2], t[0], atol=joint_tol)
|
||||||
|
]
|
||||||
if not j:
|
if not j:
|
||||||
continue
|
continue
|
||||||
J = j[0]
|
J = j[0]
|
||||||
|
|
@ -537,12 +579,7 @@ class Table(object):
|
||||||
Output filepath.
|
Output filepath.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
kw = {
|
kw = {"encoding": "utf-8", "index": False, "header": False, "quoting": 1}
|
||||||
'encoding': 'utf-8',
|
|
||||||
'index': False,
|
|
||||||
'header': False,
|
|
||||||
'quoting': 1
|
|
||||||
}
|
|
||||||
kw.update(kwargs)
|
kw.update(kwargs)
|
||||||
self.df.to_csv(path, **kw)
|
self.df.to_csv(path, **kw)
|
||||||
|
|
||||||
|
|
@ -557,12 +594,10 @@ class Table(object):
|
||||||
Output filepath.
|
Output filepath.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
kw = {
|
kw = {"orient": "records"}
|
||||||
'orient': 'records'
|
|
||||||
}
|
|
||||||
kw.update(kwargs)
|
kw.update(kwargs)
|
||||||
json_string = self.df.to_json(**kw)
|
json_string = self.df.to_json(**kw)
|
||||||
with open(path, 'w') as f:
|
with open(path, "w") as f:
|
||||||
f.write(json_string)
|
f.write(json_string)
|
||||||
|
|
||||||
def to_excel(self, path, **kwargs):
|
def to_excel(self, path, **kwargs):
|
||||||
|
|
@ -577,8 +612,8 @@ class Table(object):
|
||||||
|
|
||||||
"""
|
"""
|
||||||
kw = {
|
kw = {
|
||||||
'sheet_name': 'page-{}-table-{}'.format(self.page, self.order),
|
"sheet_name": "page-{}-table-{}".format(self.page, self.order),
|
||||||
'encoding': 'utf-8'
|
"encoding": "utf-8",
|
||||||
}
|
}
|
||||||
kw.update(kwargs)
|
kw.update(kwargs)
|
||||||
writer = pd.ExcelWriter(path)
|
writer = pd.ExcelWriter(path)
|
||||||
|
|
@ -597,7 +632,7 @@ class Table(object):
|
||||||
|
|
||||||
"""
|
"""
|
||||||
html_string = self.df.to_html(**kwargs)
|
html_string = self.df.to_html(**kwargs)
|
||||||
with open(path, 'w') as f:
|
with open(path, "w") as f:
|
||||||
f.write(html_string)
|
f.write(html_string)
|
||||||
|
|
||||||
def to_sqlite(self, path, **kwargs):
|
def to_sqlite(self, path, **kwargs):
|
||||||
|
|
@ -611,13 +646,10 @@ class Table(object):
|
||||||
Output filepath.
|
Output filepath.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
kw = {
|
kw = {"if_exists": "replace", "index": False}
|
||||||
'if_exists': 'replace',
|
|
||||||
'index': False
|
|
||||||
}
|
|
||||||
kw.update(kwargs)
|
kw.update(kwargs)
|
||||||
conn = sqlite3.connect(path)
|
conn = sqlite3.connect(path)
|
||||||
table_name = 'page-{}-table-{}'.format(self.page, self.order)
|
table_name = "page-{}-table-{}".format(self.page, self.order)
|
||||||
self.df.to_sql(table_name, conn, **kw)
|
self.df.to_sql(table_name, conn, **kw)
|
||||||
conn.commit()
|
conn.commit()
|
||||||
conn.close()
|
conn.close()
|
||||||
|
|
@ -633,12 +665,12 @@ class TableList(object):
|
||||||
Number of tables in the list.
|
Number of tables in the list.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, tables):
|
def __init__(self, tables):
|
||||||
self._tables = tables
|
self._tables = tables
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return '<{} n={}>'.format(
|
return "<{} n={}>".format(self.__class__.__name__, self.n)
|
||||||
self.__class__.__name__, self.n)
|
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
return len(self._tables)
|
return len(self._tables)
|
||||||
|
|
@ -648,37 +680,39 @@ class TableList(object):
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _format_func(table, f):
|
def _format_func(table, f):
|
||||||
return getattr(table, 'to_{}'.format(f))
|
return getattr(table, "to_{}".format(f))
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def n(self):
|
def n(self):
|
||||||
return len(self)
|
return len(self)
|
||||||
|
|
||||||
def _write_file(self, f=None, **kwargs):
|
def _write_file(self, f=None, **kwargs):
|
||||||
dirname = kwargs.get('dirname')
|
dirname = kwargs.get("dirname")
|
||||||
root = kwargs.get('root')
|
root = kwargs.get("root")
|
||||||
ext = kwargs.get('ext')
|
ext = kwargs.get("ext")
|
||||||
for table in self._tables:
|
for table in self._tables:
|
||||||
filename = os.path.join('{}-page-{}-table-{}{}'.format(
|
filename = os.path.join(
|
||||||
root, table.page, table.order, ext))
|
"{}-page-{}-table-{}{}".format(root, table.page, table.order, ext)
|
||||||
|
)
|
||||||
filepath = os.path.join(dirname, filename)
|
filepath = os.path.join(dirname, filename)
|
||||||
to_format = self._format_func(table, f)
|
to_format = self._format_func(table, f)
|
||||||
to_format(filepath)
|
to_format(filepath)
|
||||||
|
|
||||||
def _compress_dir(self, **kwargs):
|
def _compress_dir(self, **kwargs):
|
||||||
path = kwargs.get('path')
|
path = kwargs.get("path")
|
||||||
dirname = kwargs.get('dirname')
|
dirname = kwargs.get("dirname")
|
||||||
root = kwargs.get('root')
|
root = kwargs.get("root")
|
||||||
ext = kwargs.get('ext')
|
ext = kwargs.get("ext")
|
||||||
zipname = os.path.join(os.path.dirname(path), root) + '.zip'
|
zipname = os.path.join(os.path.dirname(path), root) + ".zip"
|
||||||
with zipfile.ZipFile(zipname, 'w', allowZip64=True) as z:
|
with zipfile.ZipFile(zipname, "w", allowZip64=True) as z:
|
||||||
for table in self._tables:
|
for table in self._tables:
|
||||||
filename = os.path.join('{}-page-{}-table-{}{}'.format(
|
filename = os.path.join(
|
||||||
root, table.page, table.order, ext))
|
"{}-page-{}-table-{}{}".format(root, table.page, table.order, ext)
|
||||||
|
)
|
||||||
filepath = os.path.join(dirname, filename)
|
filepath = os.path.join(dirname, filename)
|
||||||
z.write(filepath, os.path.basename(filepath))
|
z.write(filepath, os.path.basename(filepath))
|
||||||
|
|
||||||
def export(self, path, f='csv', compress=False):
|
def export(self, path, f="csv", compress=False):
|
||||||
"""Exports the list of tables to specified file format.
|
"""Exports the list of tables to specified file format.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
|
|
@ -697,33 +731,28 @@ class TableList(object):
|
||||||
if compress:
|
if compress:
|
||||||
dirname = tempfile.mkdtemp()
|
dirname = tempfile.mkdtemp()
|
||||||
|
|
||||||
kwargs = {
|
kwargs = {"path": path, "dirname": dirname, "root": root, "ext": ext}
|
||||||
'path': path,
|
|
||||||
'dirname': dirname,
|
|
||||||
'root': root,
|
|
||||||
'ext': ext
|
|
||||||
}
|
|
||||||
|
|
||||||
if f in ['csv', 'json', 'html']:
|
if f in ["csv", "json", "html"]:
|
||||||
self._write_file(f=f, **kwargs)
|
self._write_file(f=f, **kwargs)
|
||||||
if compress:
|
if compress:
|
||||||
self._compress_dir(**kwargs)
|
self._compress_dir(**kwargs)
|
||||||
elif f == 'excel':
|
elif f == "excel":
|
||||||
filepath = os.path.join(dirname, basename)
|
filepath = os.path.join(dirname, basename)
|
||||||
writer = pd.ExcelWriter(filepath)
|
writer = pd.ExcelWriter(filepath)
|
||||||
for table in self._tables:
|
for table in self._tables:
|
||||||
sheet_name = 'page-{}-table-{}'.format(table.page, table.order)
|
sheet_name = "page-{}-table-{}".format(table.page, table.order)
|
||||||
table.df.to_excel(writer, sheet_name=sheet_name, encoding='utf-8')
|
table.df.to_excel(writer, sheet_name=sheet_name, encoding="utf-8")
|
||||||
writer.save()
|
writer.save()
|
||||||
if compress:
|
if compress:
|
||||||
zipname = os.path.join(os.path.dirname(path), root) + '.zip'
|
zipname = os.path.join(os.path.dirname(path), root) + ".zip"
|
||||||
with zipfile.ZipFile(zipname, 'w', allowZip64=True) as z:
|
with zipfile.ZipFile(zipname, "w", allowZip64=True) as z:
|
||||||
z.write(filepath, os.path.basename(filepath))
|
z.write(filepath, os.path.basename(filepath))
|
||||||
elif f == 'sqlite':
|
elif f == "sqlite":
|
||||||
filepath = os.path.join(dirname, basename)
|
filepath = os.path.join(dirname, basename)
|
||||||
for table in self._tables:
|
for table in self._tables:
|
||||||
table.to_sqlite(filepath)
|
table.to_sqlite(filepath)
|
||||||
if compress:
|
if compress:
|
||||||
zipname = os.path.join(os.path.dirname(path), root) + '.zip'
|
zipname = os.path.join(os.path.dirname(path), root) + ".zip"
|
||||||
with zipfile.ZipFile(zipname, 'w', allowZip64=True) as z:
|
with zipfile.ZipFile(zipname, "w", allowZip64=True) as z:
|
||||||
z.write(filepath, os.path.basename(filepath))
|
z.write(filepath, os.path.basename(filepath))
|
||||||
|
|
|
||||||
|
|
@ -24,10 +24,10 @@ ghostscript - A Python interface for the Ghostscript interpreter C-API
|
||||||
from . import _gsprint as gs
|
from . import _gsprint as gs
|
||||||
|
|
||||||
|
|
||||||
__author__ = 'Hartmut Goebel <h.goebel@crazy-compilers.com>'
|
__author__ = "Hartmut Goebel <h.goebel@crazy-compilers.com>"
|
||||||
__copyright__ = 'Copyright 2010-2018 by Hartmut Goebel <h.goebel@crazy-compilers.com>'
|
__copyright__ = "Copyright 2010-2018 by Hartmut Goebel <h.goebel@crazy-compilers.com>"
|
||||||
__license__ = 'GNU General Public License version 3 (GPL v3)'
|
__license__ = "GNU General Public License version 3 (GPL v3)"
|
||||||
__version__ = '0.6'
|
__version__ = "0.6"
|
||||||
|
|
||||||
|
|
||||||
class __Ghostscript(object):
|
class __Ghostscript(object):
|
||||||
|
|
@ -87,10 +87,13 @@ def Ghostscript(*args, **kwargs):
|
||||||
# Ghostscript only supports a single instance
|
# Ghostscript only supports a single instance
|
||||||
if __instance__ is None:
|
if __instance__ is None:
|
||||||
__instance__ = gs.new_instance()
|
__instance__ = gs.new_instance()
|
||||||
return __Ghostscript(__instance__, args,
|
return __Ghostscript(
|
||||||
stdin=kwargs.get('stdin', None),
|
__instance__,
|
||||||
stdout=kwargs.get('stdout', None),
|
args,
|
||||||
stderr=kwargs.get('stderr', None))
|
stdin=kwargs.get("stdin", None),
|
||||||
|
stdout=kwargs.get("stdout", None),
|
||||||
|
stderr=kwargs.get("stderr", None),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
__instance__ = None
|
__instance__ = None
|
||||||
|
|
|
||||||
|
|
@ -42,10 +42,10 @@ e_Info = -110
|
||||||
#
|
#
|
||||||
e_Quit = -101
|
e_Quit = -101
|
||||||
|
|
||||||
__author__ = 'Hartmut Goebel <h.goebel@crazy-compilers.com>'
|
__author__ = "Hartmut Goebel <h.goebel@crazy-compilers.com>"
|
||||||
__copyright__ = 'Copyright 2010-2018 by Hartmut Goebel <h.goebel@crazy-compilers.com>'
|
__copyright__ = "Copyright 2010-2018 by Hartmut Goebel <h.goebel@crazy-compilers.com>"
|
||||||
__license__ = 'GNU General Public License version 3 (GPL v3)'
|
__license__ = "GNU General Public License version 3 (GPL v3)"
|
||||||
__version__ = '0.6'
|
__version__ = "0.6"
|
||||||
|
|
||||||
gs_main_instance = c_void_p
|
gs_main_instance = c_void_p
|
||||||
display_callback = c_void_p
|
display_callback = c_void_p
|
||||||
|
|
@ -55,7 +55,7 @@ display_callback = c_void_p
|
||||||
|
|
||||||
class GhostscriptError(Exception):
|
class GhostscriptError(Exception):
|
||||||
def __init__(self, ecode):
|
def __init__(self, ecode):
|
||||||
self.code = ecode
|
self.code = ecode
|
||||||
|
|
||||||
|
|
||||||
def new_instance():
|
def new_instance():
|
||||||
|
|
@ -89,6 +89,7 @@ def _wrap_stdin(infp):
|
||||||
"""Wrap a filehandle into a C function to be used as `stdin` callback
|
"""Wrap a filehandle into a C function to be used as `stdin` callback
|
||||||
for ``set_stdio``. The filehandle has to support the readline() method.
|
for ``set_stdio``. The filehandle has to support the readline() method.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def _wrap(instance, dest, count):
|
def _wrap(instance, dest, count):
|
||||||
try:
|
try:
|
||||||
data = infp.readline(count)
|
data = infp.readline(count)
|
||||||
|
|
@ -110,6 +111,7 @@ def _wrap_stdout(outfp):
|
||||||
`stderr` callback for ``set_stdio``. The filehandle has to support the
|
`stderr` callback for ``set_stdio``. The filehandle has to support the
|
||||||
write() and flush() methods.
|
write() and flush() methods.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def _wrap(instance, str, count):
|
def _wrap(instance, str, count):
|
||||||
outfp.write(str[:count])
|
outfp.write(str[:count])
|
||||||
outfp.flush()
|
outfp.flush()
|
||||||
|
|
@ -187,11 +189,23 @@ def __win32_finddll():
|
||||||
import winreg
|
import winreg
|
||||||
except ImportError:
|
except ImportError:
|
||||||
# assume Python 2
|
# assume Python 2
|
||||||
from _winreg import OpenKey, CloseKey, EnumKey, QueryValueEx, \
|
from _winreg import (
|
||||||
QueryInfoKey, HKEY_LOCAL_MACHINE
|
OpenKey,
|
||||||
|
CloseKey,
|
||||||
|
EnumKey,
|
||||||
|
QueryValueEx,
|
||||||
|
QueryInfoKey,
|
||||||
|
HKEY_LOCAL_MACHINE,
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
from winreg import OpenKey, CloseKey, EnumKey, QueryValueEx, \
|
from winreg import (
|
||||||
QueryInfoKey, HKEY_LOCAL_MACHINE
|
OpenKey,
|
||||||
|
CloseKey,
|
||||||
|
EnumKey,
|
||||||
|
QueryValueEx,
|
||||||
|
QueryInfoKey,
|
||||||
|
HKEY_LOCAL_MACHINE,
|
||||||
|
)
|
||||||
|
|
||||||
from distutils.version import LooseVersion
|
from distutils.version import LooseVersion
|
||||||
import os
|
import os
|
||||||
|
|
@ -199,15 +213,19 @@ def __win32_finddll():
|
||||||
dlls = []
|
dlls = []
|
||||||
# Look up different variants of Ghostscript and take the highest
|
# Look up different variants of Ghostscript and take the highest
|
||||||
# version for which the DLL is to be found in the filesystem.
|
# version for which the DLL is to be found in the filesystem.
|
||||||
for key_name in ('AFPL Ghostscript', 'Aladdin Ghostscript',
|
for key_name in (
|
||||||
'GNU Ghostscript', 'GPL Ghostscript'):
|
"AFPL Ghostscript",
|
||||||
|
"Aladdin Ghostscript",
|
||||||
|
"GNU Ghostscript",
|
||||||
|
"GPL Ghostscript",
|
||||||
|
):
|
||||||
try:
|
try:
|
||||||
k1 = OpenKey(HKEY_LOCAL_MACHINE, "Software\\%s" % key_name)
|
k1 = OpenKey(HKEY_LOCAL_MACHINE, "Software\\%s" % key_name)
|
||||||
for num in range(0, QueryInfoKey(k1)[0]):
|
for num in range(0, QueryInfoKey(k1)[0]):
|
||||||
version = EnumKey(k1, num)
|
version = EnumKey(k1, num)
|
||||||
try:
|
try:
|
||||||
k2 = OpenKey(k1, version)
|
k2 = OpenKey(k1, version)
|
||||||
dll_path = QueryValueEx(k2, 'GS_DLL')[0]
|
dll_path = QueryValueEx(k2, "GS_DLL")[0]
|
||||||
CloseKey(k2)
|
CloseKey(k2)
|
||||||
if os.path.exists(dll_path):
|
if os.path.exists(dll_path):
|
||||||
dlls.append((LooseVersion(version), dll_path))
|
dlls.append((LooseVersion(version), dll_path))
|
||||||
|
|
@ -223,21 +241,21 @@ def __win32_finddll():
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
if sys.platform == 'win32':
|
if sys.platform == "win32":
|
||||||
libgs = __win32_finddll()
|
libgs = __win32_finddll()
|
||||||
if not libgs:
|
if not libgs:
|
||||||
raise RuntimeError('Please make sure that Ghostscript is installed')
|
raise RuntimeError("Please make sure that Ghostscript is installed")
|
||||||
libgs = windll.LoadLibrary(libgs)
|
libgs = windll.LoadLibrary(libgs)
|
||||||
else:
|
else:
|
||||||
try:
|
try:
|
||||||
libgs = cdll.LoadLibrary('libgs.so')
|
libgs = cdll.LoadLibrary("libgs.so")
|
||||||
except OSError:
|
except OSError:
|
||||||
# shared object file not found
|
# shared object file not found
|
||||||
import ctypes.util
|
import ctypes.util
|
||||||
|
|
||||||
libgs = ctypes.util.find_library('gs')
|
libgs = ctypes.util.find_library("gs")
|
||||||
if not libgs:
|
if not libgs:
|
||||||
raise RuntimeError('Please make sure that Ghostscript is installed')
|
raise RuntimeError("Please make sure that Ghostscript is installed")
|
||||||
libgs = cdll.LoadLibrary(libgs)
|
libgs = cdll.LoadLibrary(libgs)
|
||||||
|
|
||||||
del __win32_finddll
|
del __win32_finddll
|
||||||
|
|
|
||||||
|
|
@ -7,8 +7,14 @@ from PyPDF2 import PdfFileReader, PdfFileWriter
|
||||||
|
|
||||||
from .core import TableList
|
from .core import TableList
|
||||||
from .parsers import Stream, Lattice
|
from .parsers import Stream, Lattice
|
||||||
from .utils import (TemporaryDirectory, get_page_layout, get_text_objects,
|
from .utils import (
|
||||||
get_rotation, is_url, download_url)
|
TemporaryDirectory,
|
||||||
|
get_page_layout,
|
||||||
|
get_text_objects,
|
||||||
|
get_rotation,
|
||||||
|
is_url,
|
||||||
|
download_url,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class PDFHandler(object):
|
class PDFHandler(object):
|
||||||
|
|
@ -27,19 +33,20 @@ class PDFHandler(object):
|
||||||
Password for decryption.
|
Password for decryption.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
def __init__(self, filepath, pages='1', password=None):
|
|
||||||
|
def __init__(self, filepath, pages="1", password=None):
|
||||||
if is_url(filepath):
|
if is_url(filepath):
|
||||||
filepath = download_url(filepath)
|
filepath = download_url(filepath)
|
||||||
self.filepath = filepath
|
self.filepath = filepath
|
||||||
if not filepath.lower().endswith('.pdf'):
|
if not filepath.lower().endswith(".pdf"):
|
||||||
raise NotImplementedError("File format not supported")
|
raise NotImplementedError("File format not supported")
|
||||||
|
|
||||||
if password is None:
|
if password is None:
|
||||||
self.password = ''
|
self.password = ""
|
||||||
else:
|
else:
|
||||||
self.password = password
|
self.password = password
|
||||||
if sys.version_info[0] < 3:
|
if sys.version_info[0] < 3:
|
||||||
self.password = self.password.encode('ascii')
|
self.password = self.password.encode("ascii")
|
||||||
self.pages = self._get_pages(self.filepath, pages)
|
self.pages = self._get_pages(self.filepath, pages)
|
||||||
|
|
||||||
def _get_pages(self, filepath, pages):
|
def _get_pages(self, filepath, pages):
|
||||||
|
|
@ -60,26 +67,26 @@ class PDFHandler(object):
|
||||||
|
|
||||||
"""
|
"""
|
||||||
page_numbers = []
|
page_numbers = []
|
||||||
if pages == '1':
|
if pages == "1":
|
||||||
page_numbers.append({'start': 1, 'end': 1})
|
page_numbers.append({"start": 1, "end": 1})
|
||||||
else:
|
else:
|
||||||
infile = PdfFileReader(open(filepath, 'rb'), strict=False)
|
infile = PdfFileReader(open(filepath, "rb"), strict=False)
|
||||||
if infile.isEncrypted:
|
if infile.isEncrypted:
|
||||||
infile.decrypt(self.password)
|
infile.decrypt(self.password)
|
||||||
if pages == 'all':
|
if pages == "all":
|
||||||
page_numbers.append({'start': 1, 'end': infile.getNumPages()})
|
page_numbers.append({"start": 1, "end": infile.getNumPages()})
|
||||||
else:
|
else:
|
||||||
for r in pages.split(','):
|
for r in pages.split(","):
|
||||||
if '-' in r:
|
if "-" in r:
|
||||||
a, b = r.split('-')
|
a, b = r.split("-")
|
||||||
if b == 'end':
|
if b == "end":
|
||||||
b = infile.getNumPages()
|
b = infile.getNumPages()
|
||||||
page_numbers.append({'start': int(a), 'end': int(b)})
|
page_numbers.append({"start": int(a), "end": int(b)})
|
||||||
else:
|
else:
|
||||||
page_numbers.append({'start': int(r), 'end': int(r)})
|
page_numbers.append({"start": int(r), "end": int(r)})
|
||||||
P = []
|
P = []
|
||||||
for p in page_numbers:
|
for p in page_numbers:
|
||||||
P.extend(range(p['start'], p['end'] + 1))
|
P.extend(range(p["start"], p["end"] + 1))
|
||||||
return sorted(set(P))
|
return sorted(set(P))
|
||||||
|
|
||||||
def _save_page(self, filepath, page, temp):
|
def _save_page(self, filepath, page, temp):
|
||||||
|
|
@ -95,16 +102,16 @@ class PDFHandler(object):
|
||||||
Tmp directory.
|
Tmp directory.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
with open(filepath, 'rb') as fileobj:
|
with open(filepath, "rb") as fileobj:
|
||||||
infile = PdfFileReader(fileobj, strict=False)
|
infile = PdfFileReader(fileobj, strict=False)
|
||||||
if infile.isEncrypted:
|
if infile.isEncrypted:
|
||||||
infile.decrypt(self.password)
|
infile.decrypt(self.password)
|
||||||
fpath = os.path.join(temp, 'page-{0}.pdf'.format(page))
|
fpath = os.path.join(temp, "page-{0}.pdf".format(page))
|
||||||
froot, fext = os.path.splitext(fpath)
|
froot, fext = os.path.splitext(fpath)
|
||||||
p = infile.getPage(page - 1)
|
p = infile.getPage(page - 1)
|
||||||
outfile = PdfFileWriter()
|
outfile = PdfFileWriter()
|
||||||
outfile.addPage(p)
|
outfile.addPage(p)
|
||||||
with open(fpath, 'wb') as f:
|
with open(fpath, "wb") as f:
|
||||||
outfile.write(f)
|
outfile.write(f)
|
||||||
layout, dim = get_page_layout(fpath)
|
layout, dim = get_page_layout(fpath)
|
||||||
# fix rotated PDF
|
# fix rotated PDF
|
||||||
|
|
@ -112,23 +119,25 @@ class PDFHandler(object):
|
||||||
horizontal_text = get_text_objects(layout, ltype="horizontal_text")
|
horizontal_text = get_text_objects(layout, ltype="horizontal_text")
|
||||||
vertical_text = get_text_objects(layout, ltype="vertical_text")
|
vertical_text = get_text_objects(layout, ltype="vertical_text")
|
||||||
rotation = get_rotation(chars, horizontal_text, vertical_text)
|
rotation = get_rotation(chars, horizontal_text, vertical_text)
|
||||||
if rotation != '':
|
if rotation != "":
|
||||||
fpath_new = ''.join([froot.replace('page', 'p'), '_rotated', fext])
|
fpath_new = "".join([froot.replace("page", "p"), "_rotated", fext])
|
||||||
os.rename(fpath, fpath_new)
|
os.rename(fpath, fpath_new)
|
||||||
infile = PdfFileReader(open(fpath_new, 'rb'), strict=False)
|
infile = PdfFileReader(open(fpath_new, "rb"), strict=False)
|
||||||
if infile.isEncrypted:
|
if infile.isEncrypted:
|
||||||
infile.decrypt(self.password)
|
infile.decrypt(self.password)
|
||||||
outfile = PdfFileWriter()
|
outfile = PdfFileWriter()
|
||||||
p = infile.getPage(0)
|
p = infile.getPage(0)
|
||||||
if rotation == 'anticlockwise':
|
if rotation == "anticlockwise":
|
||||||
p.rotateClockwise(90)
|
p.rotateClockwise(90)
|
||||||
elif rotation == 'clockwise':
|
elif rotation == "clockwise":
|
||||||
p.rotateCounterClockwise(90)
|
p.rotateCounterClockwise(90)
|
||||||
outfile.addPage(p)
|
outfile.addPage(p)
|
||||||
with open(fpath, 'wb') as f:
|
with open(fpath, "wb") as f:
|
||||||
outfile.write(f)
|
outfile.write(f)
|
||||||
|
|
||||||
def parse(self, flavor='lattice', suppress_stdout=False, layout_kwargs={}, **kwargs):
|
def parse(
|
||||||
|
self, flavor="lattice", suppress_stdout=False, layout_kwargs={}, **kwargs
|
||||||
|
):
|
||||||
"""Extracts tables by calling parser.get_tables on all single
|
"""Extracts tables by calling parser.get_tables on all single
|
||||||
page PDFs.
|
page PDFs.
|
||||||
|
|
||||||
|
|
@ -154,11 +163,13 @@ class PDFHandler(object):
|
||||||
with TemporaryDirectory() as tempdir:
|
with TemporaryDirectory() as tempdir:
|
||||||
for p in self.pages:
|
for p in self.pages:
|
||||||
self._save_page(self.filepath, p, tempdir)
|
self._save_page(self.filepath, p, tempdir)
|
||||||
pages = [os.path.join(tempdir, 'page-{0}.pdf'.format(p))
|
pages = [
|
||||||
for p in self.pages]
|
os.path.join(tempdir, "page-{0}.pdf".format(p)) for p in self.pages
|
||||||
parser = Lattice(**kwargs) if flavor == 'lattice' else Stream(**kwargs)
|
]
|
||||||
|
parser = Lattice(**kwargs) if flavor == "lattice" else Stream(**kwargs)
|
||||||
for p in pages:
|
for p in pages:
|
||||||
t = parser.extract_tables(p, suppress_stdout=suppress_stdout,
|
t = parser.extract_tables(
|
||||||
layout_kwargs=layout_kwargs)
|
p, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs
|
||||||
|
)
|
||||||
tables.extend(t)
|
tables.extend(t)
|
||||||
return TableList(sorted(tables))
|
return TableList(sorted(tables))
|
||||||
|
|
|
||||||
|
|
@ -39,17 +39,23 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
|
||||||
|
|
||||||
if process_background:
|
if process_background:
|
||||||
threshold = cv2.adaptiveThreshold(
|
threshold = cv2.adaptiveThreshold(
|
||||||
gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
|
gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, blocksize, c
|
||||||
cv2.THRESH_BINARY, blocksize, c)
|
)
|
||||||
else:
|
else:
|
||||||
threshold = cv2.adaptiveThreshold(
|
threshold = cv2.adaptiveThreshold(
|
||||||
np.invert(gray), 255,
|
np.invert(gray),
|
||||||
cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, blocksize, c)
|
255,
|
||||||
|
cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
|
||||||
|
cv2.THRESH_BINARY,
|
||||||
|
blocksize,
|
||||||
|
c,
|
||||||
|
)
|
||||||
return img, threshold
|
return img, threshold
|
||||||
|
|
||||||
|
|
||||||
def find_lines(threshold, regions=None, direction='horizontal',
|
def find_lines(
|
||||||
line_scale=15, iterations=0):
|
threshold, regions=None, direction="horizontal", line_scale=15, iterations=0
|
||||||
|
):
|
||||||
"""Finds horizontal and vertical lines by applying morphological
|
"""Finds horizontal and vertical lines by applying morphological
|
||||||
transformations on an image.
|
transformations on an image.
|
||||||
|
|
||||||
|
|
@ -87,15 +93,14 @@ def find_lines(threshold, regions=None, direction='horizontal',
|
||||||
"""
|
"""
|
||||||
lines = []
|
lines = []
|
||||||
|
|
||||||
if direction == 'vertical':
|
if direction == "vertical":
|
||||||
size = threshold.shape[0] // line_scale
|
size = threshold.shape[0] // line_scale
|
||||||
el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size))
|
el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size))
|
||||||
elif direction == 'horizontal':
|
elif direction == "horizontal":
|
||||||
size = threshold.shape[1] // line_scale
|
size = threshold.shape[1] // line_scale
|
||||||
el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1))
|
el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1))
|
||||||
elif direction is None:
|
elif direction is None:
|
||||||
raise ValueError("Specify direction as either 'vertical' or"
|
raise ValueError("Specify direction as either 'vertical' or" " 'horizontal'")
|
||||||
" 'horizontal'")
|
|
||||||
|
|
||||||
if regions is not None:
|
if regions is not None:
|
||||||
region_mask = np.zeros(threshold.shape)
|
region_mask = np.zeros(threshold.shape)
|
||||||
|
|
@ -110,19 +115,21 @@ def find_lines(threshold, regions=None, direction='horizontal',
|
||||||
|
|
||||||
try:
|
try:
|
||||||
_, contours, _ = cv2.findContours(
|
_, contours, _ = cv2.findContours(
|
||||||
threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
|
||||||
|
)
|
||||||
except ValueError:
|
except ValueError:
|
||||||
# for opencv backward compatibility
|
# for opencv backward compatibility
|
||||||
contours, _ = cv2.findContours(
|
contours, _ = cv2.findContours(
|
||||||
threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
|
||||||
|
)
|
||||||
|
|
||||||
for c in contours:
|
for c in contours:
|
||||||
x, y, w, h = cv2.boundingRect(c)
|
x, y, w, h = cv2.boundingRect(c)
|
||||||
x1, x2 = x, x + w
|
x1, x2 = x, x + w
|
||||||
y1, y2 = y, y + h
|
y1, y2 = y, y + h
|
||||||
if direction == 'vertical':
|
if direction == "vertical":
|
||||||
lines.append(((x1 + x2) // 2, y2, (x1 + x2) // 2, y1))
|
lines.append(((x1 + x2) // 2, y2, (x1 + x2) // 2, y1))
|
||||||
elif direction == 'horizontal':
|
elif direction == "horizontal":
|
||||||
lines.append((x1, (y1 + y2) // 2, x2, (y1 + y2) // 2))
|
lines.append((x1, (y1 + y2) // 2, x2, (y1 + y2) // 2))
|
||||||
|
|
||||||
return dmask, lines
|
return dmask, lines
|
||||||
|
|
@ -150,11 +157,13 @@ def find_contours(vertical, horizontal):
|
||||||
|
|
||||||
try:
|
try:
|
||||||
__, contours, __ = cv2.findContours(
|
__, contours, __ = cv2.findContours(
|
||||||
mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
|
||||||
|
)
|
||||||
except ValueError:
|
except ValueError:
|
||||||
# for opencv backward compatibility
|
# for opencv backward compatibility
|
||||||
contours, __ = cv2.findContours(
|
contours, __ = cv2.findContours(
|
||||||
mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
|
||||||
|
)
|
||||||
# sort in reverse based on contour area and use first 10 contours
|
# sort in reverse based on contour area and use first 10 contours
|
||||||
contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10]
|
contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10]
|
||||||
|
|
||||||
|
|
@ -196,11 +205,13 @@ def find_joints(contours, vertical, horizontal):
|
||||||
roi = joints[y : y + h, x : x + w]
|
roi = joints[y : y + h, x : x + w]
|
||||||
try:
|
try:
|
||||||
__, jc, __ = cv2.findContours(
|
__, jc, __ = cv2.findContours(
|
||||||
roi.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
|
roi.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE
|
||||||
|
)
|
||||||
except ValueError:
|
except ValueError:
|
||||||
# for opencv backward compatibility
|
# for opencv backward compatibility
|
||||||
jc, __ = cv2.findContours(
|
jc, __ = cv2.findContours(
|
||||||
roi.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
|
roi.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE
|
||||||
|
)
|
||||||
if len(jc) <= 4: # remove contours with less than 4 joints
|
if len(jc) <= 4: # remove contours with less than 4 joints
|
||||||
continue
|
continue
|
||||||
joint_coords = []
|
joint_coords = []
|
||||||
|
|
|
||||||
|
|
@ -6,8 +6,15 @@ from .handlers import PDFHandler
|
||||||
from .utils import validate_input, remove_extra
|
from .utils import validate_input, remove_extra
|
||||||
|
|
||||||
|
|
||||||
def read_pdf(filepath, pages='1', password=None, flavor='lattice',
|
def read_pdf(
|
||||||
suppress_stdout=False, layout_kwargs={}, **kwargs):
|
filepath,
|
||||||
|
pages="1",
|
||||||
|
password=None,
|
||||||
|
flavor="lattice",
|
||||||
|
suppress_stdout=False,
|
||||||
|
layout_kwargs={},
|
||||||
|
**kwargs
|
||||||
|
):
|
||||||
"""Read PDF and return extracted tables.
|
"""Read PDF and return extracted tables.
|
||||||
|
|
||||||
Note: kwargs annotated with ^ can only be used with flavor='stream'
|
Note: kwargs annotated with ^ can only be used with flavor='stream'
|
||||||
|
|
@ -91,9 +98,10 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice',
|
||||||
tables : camelot.core.TableList
|
tables : camelot.core.TableList
|
||||||
|
|
||||||
"""
|
"""
|
||||||
if flavor not in ['lattice', 'stream']:
|
if flavor not in ["lattice", "stream"]:
|
||||||
raise NotImplementedError("Unknown flavor specified."
|
raise NotImplementedError(
|
||||||
" Use either 'lattice' or 'stream'")
|
"Unknown flavor specified." " Use either 'lattice' or 'stream'"
|
||||||
|
)
|
||||||
|
|
||||||
with warnings.catch_warnings():
|
with warnings.catch_warnings():
|
||||||
if suppress_stdout:
|
if suppress_stdout:
|
||||||
|
|
@ -102,6 +110,10 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice',
|
||||||
validate_input(kwargs, flavor=flavor)
|
validate_input(kwargs, flavor=flavor)
|
||||||
p = PDFHandler(filepath, pages=pages, password=password)
|
p = PDFHandler(filepath, pages=pages, password=password)
|
||||||
kwargs = remove_extra(kwargs, flavor=flavor)
|
kwargs = remove_extra(kwargs, flavor=flavor)
|
||||||
tables = p.parse(flavor=flavor, suppress_stdout=suppress_stdout,
|
tables = p.parse(
|
||||||
layout_kwargs=layout_kwargs, **kwargs)
|
flavor=flavor,
|
||||||
|
suppress_stdout=suppress_stdout,
|
||||||
|
layout_kwargs=layout_kwargs,
|
||||||
|
**kwargs
|
||||||
|
)
|
||||||
return tables
|
return tables
|
||||||
|
|
|
||||||
|
|
@ -8,13 +8,13 @@ from ..utils import get_page_layout, get_text_objects
|
||||||
class BaseParser(object):
|
class BaseParser(object):
|
||||||
"""Defines a base parser.
|
"""Defines a base parser.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def _generate_layout(self, filename, layout_kwargs):
|
def _generate_layout(self, filename, layout_kwargs):
|
||||||
self.filename = filename
|
self.filename = filename
|
||||||
self.layout_kwargs = layout_kwargs
|
self.layout_kwargs = layout_kwargs
|
||||||
self.layout, self.dimensions = get_page_layout(
|
self.layout, self.dimensions = get_page_layout(filename, **layout_kwargs)
|
||||||
filename, **layout_kwargs)
|
self.images = get_text_objects(self.layout, ltype="image")
|
||||||
self.images = get_text_objects(self.layout, ltype='image')
|
self.horizontal_text = get_text_objects(self.layout, ltype="horizontal_text")
|
||||||
self.horizontal_text = get_text_objects(self.layout, ltype='horizontal_text')
|
self.vertical_text = get_text_objects(self.layout, ltype="vertical_text")
|
||||||
self.vertical_text = get_text_objects(self.layout, ltype='vertical_text')
|
|
||||||
self.pdf_width, self.pdf_height = self.dimensions
|
self.pdf_width, self.pdf_height = self.dimensions
|
||||||
self.rootname, __ = os.path.splitext(self.filename)
|
self.rootname, __ = os.path.splitext(self.filename)
|
||||||
|
|
|
||||||
|
|
@ -14,14 +14,25 @@ import pandas as pd
|
||||||
|
|
||||||
from .base import BaseParser
|
from .base import BaseParser
|
||||||
from ..core import Table
|
from ..core import Table
|
||||||
from ..utils import (scale_image, scale_pdf, segments_in_bbox, text_in_bbox,
|
from ..utils import (
|
||||||
merge_close_lines, get_table_index, compute_accuracy,
|
scale_image,
|
||||||
compute_whitespace)
|
scale_pdf,
|
||||||
from ..image_processing import (adaptive_threshold, find_lines,
|
segments_in_bbox,
|
||||||
find_contours, find_joints)
|
text_in_bbox,
|
||||||
|
merge_close_lines,
|
||||||
|
get_table_index,
|
||||||
|
compute_accuracy,
|
||||||
|
compute_whitespace,
|
||||||
|
)
|
||||||
|
from ..image_processing import (
|
||||||
|
adaptive_threshold,
|
||||||
|
find_lines,
|
||||||
|
find_contours,
|
||||||
|
find_joints,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger('camelot')
|
logger = logging.getLogger("camelot")
|
||||||
|
|
||||||
|
|
||||||
class Lattice(BaseParser):
|
class Lattice(BaseParser):
|
||||||
|
|
@ -83,11 +94,26 @@ class Lattice(BaseParser):
|
||||||
Resolution used for PDF to PNG conversion.
|
Resolution used for PDF to PNG conversion.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
def __init__(self, table_regions=None, table_areas=None, process_background=False,
|
|
||||||
line_scale=15, copy_text=None, shift_text=['l', 't'],
|
def __init__(
|
||||||
split_text=False, flag_size=False, strip_text='', line_tol=2,
|
self,
|
||||||
joint_tol=2, threshold_blocksize=15, threshold_constant=-2,
|
table_regions=None,
|
||||||
iterations=0, resolution=300, **kwargs):
|
table_areas=None,
|
||||||
|
process_background=False,
|
||||||
|
line_scale=15,
|
||||||
|
copy_text=None,
|
||||||
|
shift_text=["l", "t"],
|
||||||
|
split_text=False,
|
||||||
|
flag_size=False,
|
||||||
|
strip_text="",
|
||||||
|
line_tol=2,
|
||||||
|
joint_tol=2,
|
||||||
|
threshold_blocksize=15,
|
||||||
|
threshold_constant=-2,
|
||||||
|
iterations=0,
|
||||||
|
resolution=300,
|
||||||
|
**kwargs
|
||||||
|
):
|
||||||
self.table_regions = table_regions
|
self.table_regions = table_regions
|
||||||
self.table_areas = table_areas
|
self.table_areas = table_areas
|
||||||
self.process_background = process_background
|
self.process_background = process_background
|
||||||
|
|
@ -130,19 +156,19 @@ class Lattice(BaseParser):
|
||||||
indices = []
|
indices = []
|
||||||
for r_idx, c_idx, text in idx:
|
for r_idx, c_idx, text in idx:
|
||||||
for d in shift_text:
|
for d in shift_text:
|
||||||
if d == 'l':
|
if d == "l":
|
||||||
if t.cells[r_idx][c_idx].hspan:
|
if t.cells[r_idx][c_idx].hspan:
|
||||||
while not t.cells[r_idx][c_idx].left:
|
while not t.cells[r_idx][c_idx].left:
|
||||||
c_idx -= 1
|
c_idx -= 1
|
||||||
if d == 'r':
|
if d == "r":
|
||||||
if t.cells[r_idx][c_idx].hspan:
|
if t.cells[r_idx][c_idx].hspan:
|
||||||
while not t.cells[r_idx][c_idx].right:
|
while not t.cells[r_idx][c_idx].right:
|
||||||
c_idx += 1
|
c_idx += 1
|
||||||
if d == 't':
|
if d == "t":
|
||||||
if t.cells[r_idx][c_idx].vspan:
|
if t.cells[r_idx][c_idx].vspan:
|
||||||
while not t.cells[r_idx][c_idx].top:
|
while not t.cells[r_idx][c_idx].top:
|
||||||
r_idx -= 1
|
r_idx -= 1
|
||||||
if d == 'b':
|
if d == "b":
|
||||||
if t.cells[r_idx][c_idx].vspan:
|
if t.cells[r_idx][c_idx].vspan:
|
||||||
while not t.cells[r_idx][c_idx].bottom:
|
while not t.cells[r_idx][c_idx].bottom:
|
||||||
r_idx += 1
|
r_idx += 1
|
||||||
|
|
@ -171,13 +197,13 @@ class Lattice(BaseParser):
|
||||||
if f == "h":
|
if f == "h":
|
||||||
for i in range(len(t.cells)):
|
for i in range(len(t.cells)):
|
||||||
for j in range(len(t.cells[i])):
|
for j in range(len(t.cells[i])):
|
||||||
if t.cells[i][j].text.strip() == '':
|
if t.cells[i][j].text.strip() == "":
|
||||||
if t.cells[i][j].hspan and not t.cells[i][j].left:
|
if t.cells[i][j].hspan and not t.cells[i][j].left:
|
||||||
t.cells[i][j].text = t.cells[i][j - 1].text
|
t.cells[i][j].text = t.cells[i][j - 1].text
|
||||||
elif f == "v":
|
elif f == "v":
|
||||||
for i in range(len(t.cells)):
|
for i in range(len(t.cells)):
|
||||||
for j in range(len(t.cells[i])):
|
for j in range(len(t.cells[i])):
|
||||||
if t.cells[i][j].text.strip() == '':
|
if t.cells[i][j].text.strip() == "":
|
||||||
if t.cells[i][j].vspan and not t.cells[i][j].top:
|
if t.cells[i][j].vspan and not t.cells[i][j].top:
|
||||||
t.cells[i][j].text = t.cells[i - 1][j].text
|
t.cells[i][j].text = t.cells[i - 1][j].text
|
||||||
return t
|
return t
|
||||||
|
|
@ -185,11 +211,12 @@ class Lattice(BaseParser):
|
||||||
def _generate_image(self):
|
def _generate_image(self):
|
||||||
from ..ext.ghostscript import Ghostscript
|
from ..ext.ghostscript import Ghostscript
|
||||||
|
|
||||||
self.imagename = ''.join([self.rootname, '.png'])
|
self.imagename = "".join([self.rootname, ".png"])
|
||||||
gs_call = '-q -sDEVICE=png16m -o {} -r300 {}'.format(
|
gs_call = "-q -sDEVICE=png16m -o {} -r300 {}".format(
|
||||||
self.imagename, self.filename)
|
self.imagename, self.filename
|
||||||
|
)
|
||||||
gs_call = gs_call.encode().split()
|
gs_call = gs_call.encode().split()
|
||||||
null = open(os.devnull, 'wb')
|
null = open(os.devnull, "wb")
|
||||||
with Ghostscript(*gs_call, stdout=null) as gs:
|
with Ghostscript(*gs_call, stdout=null) as gs:
|
||||||
pass
|
pass
|
||||||
null.close()
|
null.close()
|
||||||
|
|
@ -208,8 +235,11 @@ class Lattice(BaseParser):
|
||||||
return scaled_areas
|
return scaled_areas
|
||||||
|
|
||||||
self.image, self.threshold = adaptive_threshold(
|
self.image, self.threshold = adaptive_threshold(
|
||||||
self.imagename, process_background=self.process_background,
|
self.imagename,
|
||||||
blocksize=self.threshold_blocksize, c=self.threshold_constant)
|
process_background=self.process_background,
|
||||||
|
blocksize=self.threshold_blocksize,
|
||||||
|
c=self.threshold_constant,
|
||||||
|
)
|
||||||
|
|
||||||
image_width = self.image.shape[1]
|
image_width = self.image.shape[1]
|
||||||
image_height = self.image.shape[0]
|
image_height = self.image.shape[0]
|
||||||
|
|
@ -226,21 +256,35 @@ class Lattice(BaseParser):
|
||||||
regions = scale_areas(self.table_regions)
|
regions = scale_areas(self.table_regions)
|
||||||
|
|
||||||
vertical_mask, vertical_segments = find_lines(
|
vertical_mask, vertical_segments = find_lines(
|
||||||
self.threshold, regions=regions, direction='vertical',
|
self.threshold,
|
||||||
line_scale=self.line_scale, iterations=self.iterations)
|
regions=regions,
|
||||||
|
direction="vertical",
|
||||||
|
line_scale=self.line_scale,
|
||||||
|
iterations=self.iterations,
|
||||||
|
)
|
||||||
horizontal_mask, horizontal_segments = find_lines(
|
horizontal_mask, horizontal_segments = find_lines(
|
||||||
self.threshold, regions=regions, direction='horizontal',
|
self.threshold,
|
||||||
line_scale=self.line_scale, iterations=self.iterations)
|
regions=regions,
|
||||||
|
direction="horizontal",
|
||||||
|
line_scale=self.line_scale,
|
||||||
|
iterations=self.iterations,
|
||||||
|
)
|
||||||
|
|
||||||
contours = find_contours(vertical_mask, horizontal_mask)
|
contours = find_contours(vertical_mask, horizontal_mask)
|
||||||
table_bbox = find_joints(contours, vertical_mask, horizontal_mask)
|
table_bbox = find_joints(contours, vertical_mask, horizontal_mask)
|
||||||
else:
|
else:
|
||||||
vertical_mask, vertical_segments = find_lines(
|
vertical_mask, vertical_segments = find_lines(
|
||||||
self.threshold, direction='vertical', line_scale=self.line_scale,
|
self.threshold,
|
||||||
iterations=self.iterations)
|
direction="vertical",
|
||||||
|
line_scale=self.line_scale,
|
||||||
|
iterations=self.iterations,
|
||||||
|
)
|
||||||
horizontal_mask, horizontal_segments = find_lines(
|
horizontal_mask, horizontal_segments = find_lines(
|
||||||
self.threshold, direction='horizontal', line_scale=self.line_scale,
|
self.threshold,
|
||||||
iterations=self.iterations)
|
direction="horizontal",
|
||||||
|
line_scale=self.line_scale,
|
||||||
|
iterations=self.iterations,
|
||||||
|
)
|
||||||
|
|
||||||
areas = scale_areas(self.table_areas)
|
areas = scale_areas(self.table_areas)
|
||||||
table_bbox = find_joints(areas, vertical_mask, horizontal_mask)
|
table_bbox = find_joints(areas, vertical_mask, horizontal_mask)
|
||||||
|
|
@ -248,18 +292,20 @@ class Lattice(BaseParser):
|
||||||
self.table_bbox_unscaled = copy.deepcopy(table_bbox)
|
self.table_bbox_unscaled = copy.deepcopy(table_bbox)
|
||||||
|
|
||||||
self.table_bbox, self.vertical_segments, self.horizontal_segments = scale_image(
|
self.table_bbox, self.vertical_segments, self.horizontal_segments = scale_image(
|
||||||
table_bbox, vertical_segments, horizontal_segments, pdf_scalers)
|
table_bbox, vertical_segments, horizontal_segments, pdf_scalers
|
||||||
|
)
|
||||||
|
|
||||||
def _generate_columns_and_rows(self, table_idx, tk):
|
def _generate_columns_and_rows(self, table_idx, tk):
|
||||||
# select elements which lie within table_bbox
|
# select elements which lie within table_bbox
|
||||||
t_bbox = {}
|
t_bbox = {}
|
||||||
v_s, h_s = segments_in_bbox(
|
v_s, h_s = segments_in_bbox(
|
||||||
tk, self.vertical_segments, self.horizontal_segments)
|
tk, self.vertical_segments, self.horizontal_segments
|
||||||
t_bbox['horizontal'] = text_in_bbox(tk, self.horizontal_text)
|
)
|
||||||
t_bbox['vertical'] = text_in_bbox(tk, self.vertical_text)
|
t_bbox["horizontal"] = text_in_bbox(tk, self.horizontal_text)
|
||||||
|
t_bbox["vertical"] = text_in_bbox(tk, self.vertical_text)
|
||||||
|
|
||||||
t_bbox['horizontal'].sort(key=lambda x: (-x.y0, x.x0))
|
t_bbox["horizontal"].sort(key=lambda x: (-x.y0, x.x0))
|
||||||
t_bbox['vertical'].sort(key=lambda x: (x.x0, -x.y0))
|
t_bbox["vertical"].sort(key=lambda x: (x.x0, -x.y0))
|
||||||
|
|
||||||
self.t_bbox = t_bbox
|
self.t_bbox = t_bbox
|
||||||
|
|
||||||
|
|
@ -268,23 +314,19 @@ class Lattice(BaseParser):
|
||||||
cols.extend([tk[0], tk[2]])
|
cols.extend([tk[0], tk[2]])
|
||||||
rows.extend([tk[1], tk[3]])
|
rows.extend([tk[1], tk[3]])
|
||||||
# sort horizontal and vertical segments
|
# sort horizontal and vertical segments
|
||||||
cols = merge_close_lines(
|
cols = merge_close_lines(sorted(cols), line_tol=self.line_tol)
|
||||||
sorted(cols), line_tol=self.line_tol)
|
rows = merge_close_lines(sorted(rows, reverse=True), line_tol=self.line_tol)
|
||||||
rows = merge_close_lines(
|
|
||||||
sorted(rows, reverse=True), line_tol=self.line_tol)
|
|
||||||
# make grid using x and y coord of shortlisted rows and cols
|
# make grid using x and y coord of shortlisted rows and cols
|
||||||
cols = [(cols[i], cols[i + 1])
|
cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
|
||||||
for i in range(0, len(cols) - 1)]
|
rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)]
|
||||||
rows = [(rows[i], rows[i + 1])
|
|
||||||
for i in range(0, len(rows) - 1)]
|
|
||||||
|
|
||||||
return cols, rows, v_s, h_s
|
return cols, rows, v_s, h_s
|
||||||
|
|
||||||
def _generate_table(self, table_idx, cols, rows, **kwargs):
|
def _generate_table(self, table_idx, cols, rows, **kwargs):
|
||||||
v_s = kwargs.get('v_s')
|
v_s = kwargs.get("v_s")
|
||||||
h_s = kwargs.get('h_s')
|
h_s = kwargs.get("h_s")
|
||||||
if v_s is None or h_s is None:
|
if v_s is None or h_s is None:
|
||||||
raise ValueError('No segments found on {}'.format(self.rootname))
|
raise ValueError("No segments found on {}".format(self.rootname))
|
||||||
|
|
||||||
table = Table(cols, rows)
|
table = Table(cols, rows)
|
||||||
# set table edges to True using ver+hor lines
|
# set table edges to True using ver+hor lines
|
||||||
|
|
@ -297,14 +339,21 @@ class Lattice(BaseParser):
|
||||||
pos_errors = []
|
pos_errors = []
|
||||||
# TODO: have a single list in place of two directional ones?
|
# TODO: have a single list in place of two directional ones?
|
||||||
# sorted on x-coordinate based on reading order i.e. LTR or RTL
|
# sorted on x-coordinate based on reading order i.e. LTR or RTL
|
||||||
for direction in ['vertical', 'horizontal']:
|
for direction in ["vertical", "horizontal"]:
|
||||||
for t in self.t_bbox[direction]:
|
for t in self.t_bbox[direction]:
|
||||||
indices, error = get_table_index(
|
indices, error = get_table_index(
|
||||||
table, t, direction, split_text=self.split_text,
|
table,
|
||||||
flag_size=self.flag_size, strip_text=self.strip_text)
|
t,
|
||||||
|
direction,
|
||||||
|
split_text=self.split_text,
|
||||||
|
flag_size=self.flag_size,
|
||||||
|
strip_text=self.strip_text,
|
||||||
|
)
|
||||||
if indices[:2] != (-1, -1):
|
if indices[:2] != (-1, -1):
|
||||||
pos_errors.append(error)
|
pos_errors.append(error)
|
||||||
indices = Lattice._reduce_index(table, indices, shift_text=self.shift_text)
|
indices = Lattice._reduce_index(
|
||||||
|
table, indices, shift_text=self.shift_text
|
||||||
|
)
|
||||||
for r_idx, c_idx, text in indices:
|
for r_idx, c_idx, text in indices:
|
||||||
table.cells[r_idx][c_idx].text = text
|
table.cells[r_idx][c_idx].text = text
|
||||||
accuracy = compute_accuracy([[100, pos_errors]])
|
accuracy = compute_accuracy([[100, pos_errors]])
|
||||||
|
|
@ -317,11 +366,11 @@ class Lattice(BaseParser):
|
||||||
table.shape = table.df.shape
|
table.shape = table.df.shape
|
||||||
|
|
||||||
whitespace = compute_whitespace(data)
|
whitespace = compute_whitespace(data)
|
||||||
table.flavor = 'lattice'
|
table.flavor = "lattice"
|
||||||
table.accuracy = accuracy
|
table.accuracy = accuracy
|
||||||
table.whitespace = whitespace
|
table.whitespace = whitespace
|
||||||
table.order = table_idx + 1
|
table.order = table_idx + 1
|
||||||
table.page = int(os.path.basename(self.rootname).replace('page-', ''))
|
table.page = int(os.path.basename(self.rootname).replace("page-", ""))
|
||||||
|
|
||||||
# for plotting
|
# for plotting
|
||||||
_text = []
|
_text = []
|
||||||
|
|
@ -337,15 +386,18 @@ class Lattice(BaseParser):
|
||||||
def extract_tables(self, filename, suppress_stdout=False, layout_kwargs={}):
|
def extract_tables(self, filename, suppress_stdout=False, layout_kwargs={}):
|
||||||
self._generate_layout(filename, layout_kwargs)
|
self._generate_layout(filename, layout_kwargs)
|
||||||
if not suppress_stdout:
|
if not suppress_stdout:
|
||||||
logger.info('Processing {}'.format(os.path.basename(self.rootname)))
|
logger.info("Processing {}".format(os.path.basename(self.rootname)))
|
||||||
|
|
||||||
if not self.horizontal_text:
|
if not self.horizontal_text:
|
||||||
if self.images:
|
if self.images:
|
||||||
warnings.warn('{} is image-based, camelot only works on'
|
warnings.warn(
|
||||||
' text-based pages.'.format(os.path.basename(self.rootname)))
|
"{} is image-based, camelot only works on"
|
||||||
|
" text-based pages.".format(os.path.basename(self.rootname))
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
warnings.warn('No tables found on {}'.format(
|
warnings.warn(
|
||||||
os.path.basename(self.rootname)))
|
"No tables found on {}".format(os.path.basename(self.rootname))
|
||||||
|
)
|
||||||
return []
|
return []
|
||||||
|
|
||||||
self._generate_image()
|
self._generate_image()
|
||||||
|
|
@ -353,8 +405,9 @@ class Lattice(BaseParser):
|
||||||
|
|
||||||
_tables = []
|
_tables = []
|
||||||
# sort tables based on y-coord
|
# sort tables based on y-coord
|
||||||
for table_idx, tk in enumerate(sorted(
|
for table_idx, tk in enumerate(
|
||||||
self.table_bbox.keys(), key=lambda x: x[1], reverse=True)):
|
sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True)
|
||||||
|
):
|
||||||
cols, rows, v_s, h_s = self._generate_columns_and_rows(table_idx, tk)
|
cols, rows, v_s, h_s = self._generate_columns_and_rows(table_idx, tk)
|
||||||
table = self._generate_table(table_idx, cols, rows, v_s=v_s, h_s=h_s)
|
table = self._generate_table(table_idx, cols, rows, v_s=v_s, h_s=h_s)
|
||||||
table._bbox = tk
|
table._bbox = tk
|
||||||
|
|
|
||||||
|
|
@ -10,11 +10,10 @@ import pandas as pd
|
||||||
|
|
||||||
from .base import BaseParser
|
from .base import BaseParser
|
||||||
from ..core import TextEdges, Table
|
from ..core import TextEdges, Table
|
||||||
from ..utils import (text_in_bbox, get_table_index, compute_accuracy,
|
from ..utils import text_in_bbox, get_table_index, compute_accuracy, compute_whitespace
|
||||||
compute_whitespace)
|
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger('camelot')
|
logger = logging.getLogger("camelot")
|
||||||
|
|
||||||
|
|
||||||
class Stream(BaseParser):
|
class Stream(BaseParser):
|
||||||
|
|
@ -55,9 +54,20 @@ class Stream(BaseParser):
|
||||||
to generate columns.
|
to generate columns.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
def __init__(self, table_regions=None, table_areas=None, columns=None, split_text=False,
|
|
||||||
flag_size=False, strip_text='', edge_tol=50, row_tol=2,
|
def __init__(
|
||||||
column_tol=0, **kwargs):
|
self,
|
||||||
|
table_regions=None,
|
||||||
|
table_areas=None,
|
||||||
|
columns=None,
|
||||||
|
split_text=False,
|
||||||
|
flag_size=False,
|
||||||
|
strip_text="",
|
||||||
|
edge_tol=50,
|
||||||
|
row_tol=2,
|
||||||
|
column_tol=0,
|
||||||
|
**kwargs
|
||||||
|
):
|
||||||
self.table_regions = table_regions
|
self.table_regions = table_regions
|
||||||
self.table_areas = table_areas
|
self.table_areas = table_areas
|
||||||
self.columns = columns
|
self.columns = columns
|
||||||
|
|
@ -150,8 +160,9 @@ class Stream(BaseParser):
|
||||||
else:
|
else:
|
||||||
lower = merged[-1]
|
lower = merged[-1]
|
||||||
if column_tol >= 0:
|
if column_tol >= 0:
|
||||||
if (higher[0] <= lower[1] or
|
if higher[0] <= lower[1] or np.isclose(
|
||||||
np.isclose(higher[0], lower[1], atol=column_tol)):
|
higher[0], lower[1], atol=column_tol
|
||||||
|
):
|
||||||
upper_bound = max(lower[1], higher[1])
|
upper_bound = max(lower[1], higher[1])
|
||||||
lower_bound = min(lower[0], higher[0])
|
lower_bound = min(lower[0], higher[0])
|
||||||
merged[-1] = (lower_bound, upper_bound)
|
merged[-1] = (lower_bound, upper_bound)
|
||||||
|
|
@ -186,13 +197,14 @@ class Stream(BaseParser):
|
||||||
List of continuous row y-coordinate tuples.
|
List of continuous row y-coordinate tuples.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
row_mids = [sum([(t.y0 + t.y1) / 2 for t in r]) / len(r)
|
row_mids = [
|
||||||
if len(r) > 0 else 0 for r in rows_grouped]
|
sum([(t.y0 + t.y1) / 2 for t in r]) / len(r) if len(r) > 0 else 0
|
||||||
|
for r in rows_grouped
|
||||||
|
]
|
||||||
rows = [(row_mids[i] + row_mids[i - 1]) / 2 for i in range(1, len(row_mids))]
|
rows = [(row_mids[i] + row_mids[i - 1]) / 2 for i in range(1, len(row_mids))]
|
||||||
rows.insert(0, text_y_max)
|
rows.insert(0, text_y_max)
|
||||||
rows.append(text_y_min)
|
rows.append(text_y_min)
|
||||||
rows = [(rows[i], rows[i + 1])
|
rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)]
|
||||||
for i in range(0, len(rows) - 1)]
|
|
||||||
return rows
|
return rows
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
|
@ -217,8 +229,9 @@ class Stream(BaseParser):
|
||||||
if text:
|
if text:
|
||||||
text = Stream._group_rows(text, row_tol=row_tol)
|
text = Stream._group_rows(text, row_tol=row_tol)
|
||||||
elements = [len(r) for r in text]
|
elements = [len(r) for r in text]
|
||||||
new_cols = [(t.x0, t.x1)
|
new_cols = [
|
||||||
for r in text if len(r) == max(elements) for t in r]
|
(t.x0, t.x1) for r in text if len(r) == max(elements) for t in r
|
||||||
|
]
|
||||||
cols.extend(Stream._merge_columns(sorted(new_cols)))
|
cols.extend(Stream._merge_columns(sorted(new_cols)))
|
||||||
return cols
|
return cols
|
||||||
|
|
||||||
|
|
@ -243,15 +256,13 @@ class Stream(BaseParser):
|
||||||
cols = [(cols[i][0] + cols[i - 1][1]) / 2 for i in range(1, len(cols))]
|
cols = [(cols[i][0] + cols[i - 1][1]) / 2 for i in range(1, len(cols))]
|
||||||
cols.insert(0, text_x_min)
|
cols.insert(0, text_x_min)
|
||||||
cols.append(text_x_max)
|
cols.append(text_x_max)
|
||||||
cols = [(cols[i], cols[i + 1])
|
cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
|
||||||
for i in range(0, len(cols) - 1)]
|
|
||||||
return cols
|
return cols
|
||||||
|
|
||||||
def _validate_columns(self):
|
def _validate_columns(self):
|
||||||
if self.table_areas is not None and self.columns is not None:
|
if self.table_areas is not None and self.columns is not None:
|
||||||
if len(self.table_areas) != len(self.columns):
|
if len(self.table_areas) != len(self.columns):
|
||||||
raise ValueError("Length of table_areas and columns"
|
raise ValueError("Length of table_areas and columns" " should be equal")
|
||||||
" should be equal")
|
|
||||||
|
|
||||||
def _nurminen_table_detection(self, textlines):
|
def _nurminen_table_detection(self, textlines):
|
||||||
"""A general implementation of the table detection algorithm
|
"""A general implementation of the table detection algorithm
|
||||||
|
|
@ -309,16 +320,16 @@ class Stream(BaseParser):
|
||||||
def _generate_columns_and_rows(self, table_idx, tk):
|
def _generate_columns_and_rows(self, table_idx, tk):
|
||||||
# select elements which lie within table_bbox
|
# select elements which lie within table_bbox
|
||||||
t_bbox = {}
|
t_bbox = {}
|
||||||
t_bbox['horizontal'] = text_in_bbox(tk, self.horizontal_text)
|
t_bbox["horizontal"] = text_in_bbox(tk, self.horizontal_text)
|
||||||
t_bbox['vertical'] = text_in_bbox(tk, self.vertical_text)
|
t_bbox["vertical"] = text_in_bbox(tk, self.vertical_text)
|
||||||
|
|
||||||
t_bbox['horizontal'].sort(key=lambda x: (-x.y0, x.x0))
|
t_bbox["horizontal"].sort(key=lambda x: (-x.y0, x.x0))
|
||||||
t_bbox['vertical'].sort(key=lambda x: (x.x0, -x.y0))
|
t_bbox["vertical"].sort(key=lambda x: (x.x0, -x.y0))
|
||||||
|
|
||||||
self.t_bbox = t_bbox
|
self.t_bbox = t_bbox
|
||||||
|
|
||||||
text_x_min, text_y_min, text_x_max, text_y_max = self._text_bbox(self.t_bbox)
|
text_x_min, text_y_min, text_x_max, text_y_max = self._text_bbox(self.t_bbox)
|
||||||
rows_grouped = self._group_rows(self.t_bbox['horizontal'], row_tol=self.row_tol)
|
rows_grouped = self._group_rows(self.t_bbox["horizontal"], row_tol=self.row_tol)
|
||||||
rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
|
rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
|
||||||
elements = [len(r) for r in rows_grouped]
|
elements = [len(r) for r in rows_grouped]
|
||||||
|
|
||||||
|
|
@ -327,7 +338,7 @@ class Stream(BaseParser):
|
||||||
# take (0, pdf_width) by default
|
# take (0, pdf_width) by default
|
||||||
# similar to else condition
|
# similar to else condition
|
||||||
# len can't be 1
|
# len can't be 1
|
||||||
cols = self.columns[table_idx].split(',')
|
cols = self.columns[table_idx].split(",")
|
||||||
cols = [float(c) for c in cols]
|
cols = [float(c) for c in cols]
|
||||||
cols.insert(0, text_x_min)
|
cols.insert(0, text_x_min)
|
||||||
cols.append(text_x_max)
|
cols.append(text_x_max)
|
||||||
|
|
@ -346,20 +357,29 @@ class Stream(BaseParser):
|
||||||
if len(elements):
|
if len(elements):
|
||||||
ncols = max(set(elements), key=elements.count)
|
ncols = max(set(elements), key=elements.count)
|
||||||
else:
|
else:
|
||||||
warnings.warn("No tables found in table area {}".format(
|
warnings.warn(
|
||||||
table_idx + 1))
|
"No tables found in table area {}".format(table_idx + 1)
|
||||||
|
)
|
||||||
cols = [(t.x0, t.x1) for r in rows_grouped if len(r) == ncols for t in r]
|
cols = [(t.x0, t.x1) for r in rows_grouped if len(r) == ncols for t in r]
|
||||||
cols = self._merge_columns(sorted(cols), column_tol=self.column_tol)
|
cols = self._merge_columns(sorted(cols), column_tol=self.column_tol)
|
||||||
inner_text = []
|
inner_text = []
|
||||||
for i in range(1, len(cols)):
|
for i in range(1, len(cols)):
|
||||||
left = cols[i - 1][1]
|
left = cols[i - 1][1]
|
||||||
right = cols[i][0]
|
right = cols[i][0]
|
||||||
inner_text.extend([t for direction in self.t_bbox
|
inner_text.extend(
|
||||||
for t in self.t_bbox[direction]
|
[
|
||||||
if t.x0 > left and t.x1 < right])
|
t
|
||||||
outer_text = [t for direction in self.t_bbox
|
for direction in self.t_bbox
|
||||||
for t in self.t_bbox[direction]
|
for t in self.t_bbox[direction]
|
||||||
if t.x0 > cols[-1][1] or t.x1 < cols[0][0]]
|
if t.x0 > left and t.x1 < right
|
||||||
|
]
|
||||||
|
)
|
||||||
|
outer_text = [
|
||||||
|
t
|
||||||
|
for direction in self.t_bbox
|
||||||
|
for t in self.t_bbox[direction]
|
||||||
|
if t.x0 > cols[-1][1] or t.x1 < cols[0][0]
|
||||||
|
]
|
||||||
inner_text.extend(outer_text)
|
inner_text.extend(outer_text)
|
||||||
cols = self._add_columns(cols, inner_text, self.row_tol)
|
cols = self._add_columns(cols, inner_text, self.row_tol)
|
||||||
cols = self._join_columns(cols, text_x_min, text_x_max)
|
cols = self._join_columns(cols, text_x_min, text_x_max)
|
||||||
|
|
@ -373,11 +393,16 @@ class Stream(BaseParser):
|
||||||
pos_errors = []
|
pos_errors = []
|
||||||
# TODO: have a single list in place of two directional ones?
|
# TODO: have a single list in place of two directional ones?
|
||||||
# sorted on x-coordinate based on reading order i.e. LTR or RTL
|
# sorted on x-coordinate based on reading order i.e. LTR or RTL
|
||||||
for direction in ['vertical', 'horizontal']:
|
for direction in ["vertical", "horizontal"]:
|
||||||
for t in self.t_bbox[direction]:
|
for t in self.t_bbox[direction]:
|
||||||
indices, error = get_table_index(
|
indices, error = get_table_index(
|
||||||
table, t, direction, split_text=self.split_text,
|
table,
|
||||||
flag_size=self.flag_size, strip_text=self.strip_text)
|
t,
|
||||||
|
direction,
|
||||||
|
split_text=self.split_text,
|
||||||
|
flag_size=self.flag_size,
|
||||||
|
strip_text=self.strip_text,
|
||||||
|
)
|
||||||
if indices[:2] != (-1, -1):
|
if indices[:2] != (-1, -1):
|
||||||
pos_errors.append(error)
|
pos_errors.append(error)
|
||||||
for r_idx, c_idx, text in indices:
|
for r_idx, c_idx, text in indices:
|
||||||
|
|
@ -389,11 +414,11 @@ class Stream(BaseParser):
|
||||||
table.shape = table.df.shape
|
table.shape = table.df.shape
|
||||||
|
|
||||||
whitespace = compute_whitespace(data)
|
whitespace = compute_whitespace(data)
|
||||||
table.flavor = 'stream'
|
table.flavor = "stream"
|
||||||
table.accuracy = accuracy
|
table.accuracy = accuracy
|
||||||
table.whitespace = whitespace
|
table.whitespace = whitespace
|
||||||
table.order = table_idx + 1
|
table.order = table_idx + 1
|
||||||
table.page = int(os.path.basename(self.rootname).replace('page-', ''))
|
table.page = int(os.path.basename(self.rootname).replace("page-", ""))
|
||||||
|
|
||||||
# for plotting
|
# for plotting
|
||||||
_text = []
|
_text = []
|
||||||
|
|
@ -409,23 +434,27 @@ class Stream(BaseParser):
|
||||||
def extract_tables(self, filename, suppress_stdout=False, layout_kwargs={}):
|
def extract_tables(self, filename, suppress_stdout=False, layout_kwargs={}):
|
||||||
self._generate_layout(filename, layout_kwargs)
|
self._generate_layout(filename, layout_kwargs)
|
||||||
if not suppress_stdout:
|
if not suppress_stdout:
|
||||||
logger.info('Processing {}'.format(os.path.basename(self.rootname)))
|
logger.info("Processing {}".format(os.path.basename(self.rootname)))
|
||||||
|
|
||||||
if not self.horizontal_text:
|
if not self.horizontal_text:
|
||||||
if self.images:
|
if self.images:
|
||||||
warnings.warn('{} is image-based, camelot only works on'
|
warnings.warn(
|
||||||
' text-based pages.'.format(os.path.basename(self.rootname)))
|
"{} is image-based, camelot only works on"
|
||||||
|
" text-based pages.".format(os.path.basename(self.rootname))
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
warnings.warn('No tables found on {}'.format(
|
warnings.warn(
|
||||||
os.path.basename(self.rootname)))
|
"No tables found on {}".format(os.path.basename(self.rootname))
|
||||||
|
)
|
||||||
return []
|
return []
|
||||||
|
|
||||||
self._generate_table_bbox()
|
self._generate_table_bbox()
|
||||||
|
|
||||||
_tables = []
|
_tables = []
|
||||||
# sort tables based on y-coord
|
# sort tables based on y-coord
|
||||||
for table_idx, tk in enumerate(sorted(
|
for table_idx, tk in enumerate(
|
||||||
self.table_bbox.keys(), key=lambda x: x[1], reverse=True)):
|
sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True)
|
||||||
|
):
|
||||||
cols, rows = self._generate_columns_and_rows(table_idx, tk)
|
cols, rows = self._generate_columns_and_rows(table_idx, tk)
|
||||||
table = self._generate_table(table_idx, cols, rows)
|
table = self._generate_table(table_idx, cols, rows)
|
||||||
table._bbox = tk
|
table._bbox = tk
|
||||||
|
|
|
||||||
|
|
@ -10,7 +10,7 @@ else:
|
||||||
|
|
||||||
|
|
||||||
class PlotMethods(object):
|
class PlotMethods(object):
|
||||||
def __call__(self, table, kind='text', filename=None):
|
def __call__(self, table, kind="text", filename=None):
|
||||||
"""Plot elements found on PDF page based on kind
|
"""Plot elements found on PDF page based on kind
|
||||||
specified, useful for debugging and playing with different
|
specified, useful for debugging and playing with different
|
||||||
parameters to get the best output.
|
parameters to get the best output.
|
||||||
|
|
@ -31,14 +31,16 @@ class PlotMethods(object):
|
||||||
|
|
||||||
"""
|
"""
|
||||||
if not _HAS_MPL:
|
if not _HAS_MPL:
|
||||||
raise ImportError('matplotlib is required for plotting.')
|
raise ImportError("matplotlib is required for plotting.")
|
||||||
|
|
||||||
if table.flavor == 'lattice' and kind in ['textedge']:
|
if table.flavor == "lattice" and kind in ["textedge"]:
|
||||||
raise NotImplementedError("Lattice flavor does not support kind='{}'".format(
|
raise NotImplementedError(
|
||||||
kind))
|
"Lattice flavor does not support kind='{}'".format(kind)
|
||||||
elif table.flavor == 'stream' and kind in ['joint', 'line']:
|
)
|
||||||
raise NotImplementedError("Stream flavor does not support kind='{}'".format(
|
elif table.flavor == "stream" and kind in ["joint", "line"]:
|
||||||
kind))
|
raise NotImplementedError(
|
||||||
|
"Stream flavor does not support kind='{}'".format(kind)
|
||||||
|
)
|
||||||
|
|
||||||
plot_method = getattr(self, kind)
|
plot_method = getattr(self, kind)
|
||||||
return plot_method(table)
|
return plot_method(table)
|
||||||
|
|
@ -57,18 +59,12 @@ class PlotMethods(object):
|
||||||
|
|
||||||
"""
|
"""
|
||||||
fig = plt.figure()
|
fig = plt.figure()
|
||||||
ax = fig.add_subplot(111, aspect='equal')
|
ax = fig.add_subplot(111, aspect="equal")
|
||||||
xs, ys = [], []
|
xs, ys = [], []
|
||||||
for t in table._text:
|
for t in table._text:
|
||||||
xs.extend([t[0], t[2]])
|
xs.extend([t[0], t[2]])
|
||||||
ys.extend([t[1], t[3]])
|
ys.extend([t[1], t[3]])
|
||||||
ax.add_patch(
|
ax.add_patch(patches.Rectangle((t[0], t[1]), t[2] - t[0], t[3] - t[1]))
|
||||||
patches.Rectangle(
|
|
||||||
(t[0], t[1]),
|
|
||||||
t[2] - t[0],
|
|
||||||
t[3] - t[1]
|
|
||||||
)
|
|
||||||
)
|
|
||||||
ax.set_xlim(min(xs) - 10, max(xs) + 10)
|
ax.set_xlim(min(xs) - 10, max(xs) + 10)
|
||||||
ax.set_ylim(min(ys) - 10, max(ys) + 10)
|
ax.set_ylim(min(ys) - 10, max(ys) + 10)
|
||||||
return fig
|
return fig
|
||||||
|
|
@ -87,21 +83,17 @@ class PlotMethods(object):
|
||||||
|
|
||||||
"""
|
"""
|
||||||
fig = plt.figure()
|
fig = plt.figure()
|
||||||
ax = fig.add_subplot(111, aspect='equal')
|
ax = fig.add_subplot(111, aspect="equal")
|
||||||
for row in table.cells:
|
for row in table.cells:
|
||||||
for cell in row:
|
for cell in row:
|
||||||
if cell.left:
|
if cell.left:
|
||||||
ax.plot([cell.lb[0], cell.lt[0]],
|
ax.plot([cell.lb[0], cell.lt[0]], [cell.lb[1], cell.lt[1]])
|
||||||
[cell.lb[1], cell.lt[1]])
|
|
||||||
if cell.right:
|
if cell.right:
|
||||||
ax.plot([cell.rb[0], cell.rt[0]],
|
ax.plot([cell.rb[0], cell.rt[0]], [cell.rb[1], cell.rt[1]])
|
||||||
[cell.rb[1], cell.rt[1]])
|
|
||||||
if cell.top:
|
if cell.top:
|
||||||
ax.plot([cell.lt[0], cell.rt[0]],
|
ax.plot([cell.lt[0], cell.rt[0]], [cell.lt[1], cell.rt[1]])
|
||||||
[cell.lt[1], cell.rt[1]])
|
|
||||||
if cell.bottom:
|
if cell.bottom:
|
||||||
ax.plot([cell.lb[0], cell.rb[0]],
|
ax.plot([cell.lb[0], cell.rb[0]], [cell.lb[1], cell.rb[1]])
|
||||||
[cell.lb[1], cell.rb[1]])
|
|
||||||
return fig
|
return fig
|
||||||
|
|
||||||
def contour(self, table):
|
def contour(self, table):
|
||||||
|
|
@ -124,7 +116,7 @@ class PlotMethods(object):
|
||||||
img, table_bbox = (None, {table._bbox: None})
|
img, table_bbox = (None, {table._bbox: None})
|
||||||
_FOR_LATTICE = False
|
_FOR_LATTICE = False
|
||||||
fig = plt.figure()
|
fig = plt.figure()
|
||||||
ax = fig.add_subplot(111, aspect='equal')
|
ax = fig.add_subplot(111, aspect="equal")
|
||||||
|
|
||||||
xs, ys = [], []
|
xs, ys = [], []
|
||||||
if not _FOR_LATTICE:
|
if not _FOR_LATTICE:
|
||||||
|
|
@ -133,21 +125,14 @@ class PlotMethods(object):
|
||||||
ys.extend([t[1], t[3]])
|
ys.extend([t[1], t[3]])
|
||||||
ax.add_patch(
|
ax.add_patch(
|
||||||
patches.Rectangle(
|
patches.Rectangle(
|
||||||
(t[0], t[1]),
|
(t[0], t[1]), t[2] - t[0], t[3] - t[1], color="blue"
|
||||||
t[2] - t[0],
|
|
||||||
t[3] - t[1],
|
|
||||||
color='blue'
|
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
for t in table_bbox.keys():
|
for t in table_bbox.keys():
|
||||||
ax.add_patch(
|
ax.add_patch(
|
||||||
patches.Rectangle(
|
patches.Rectangle(
|
||||||
(t[0], t[1]),
|
(t[0], t[1]), t[2] - t[0], t[3] - t[1], fill=False, color="red"
|
||||||
t[2] - t[0],
|
|
||||||
t[3] - t[1],
|
|
||||||
fill=False,
|
|
||||||
color='red'
|
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
if not _FOR_LATTICE:
|
if not _FOR_LATTICE:
|
||||||
|
|
@ -173,25 +158,19 @@ class PlotMethods(object):
|
||||||
|
|
||||||
"""
|
"""
|
||||||
fig = plt.figure()
|
fig = plt.figure()
|
||||||
ax = fig.add_subplot(111, aspect='equal')
|
ax = fig.add_subplot(111, aspect="equal")
|
||||||
xs, ys = [], []
|
xs, ys = [], []
|
||||||
for t in table._text:
|
for t in table._text:
|
||||||
xs.extend([t[0], t[2]])
|
xs.extend([t[0], t[2]])
|
||||||
ys.extend([t[1], t[3]])
|
ys.extend([t[1], t[3]])
|
||||||
ax.add_patch(
|
ax.add_patch(
|
||||||
patches.Rectangle(
|
patches.Rectangle((t[0], t[1]), t[2] - t[0], t[3] - t[1], color="blue")
|
||||||
(t[0], t[1]),
|
|
||||||
t[2] - t[0],
|
|
||||||
t[3] - t[1],
|
|
||||||
color='blue'
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
ax.set_xlim(min(xs) - 10, max(xs) + 10)
|
ax.set_xlim(min(xs) - 10, max(xs) + 10)
|
||||||
ax.set_ylim(min(ys) - 10, max(ys) + 10)
|
ax.set_ylim(min(ys) - 10, max(ys) + 10)
|
||||||
|
|
||||||
for te in table._textedges:
|
for te in table._textedges:
|
||||||
ax.plot([te.x, te.x],
|
ax.plot([te.x, te.x], [te.y0, te.y1])
|
||||||
[te.y0, te.y1])
|
|
||||||
|
|
||||||
return fig
|
return fig
|
||||||
|
|
||||||
|
|
@ -210,14 +189,14 @@ class PlotMethods(object):
|
||||||
"""
|
"""
|
||||||
img, table_bbox = table._image
|
img, table_bbox = table._image
|
||||||
fig = plt.figure()
|
fig = plt.figure()
|
||||||
ax = fig.add_subplot(111, aspect='equal')
|
ax = fig.add_subplot(111, aspect="equal")
|
||||||
x_coord = []
|
x_coord = []
|
||||||
y_coord = []
|
y_coord = []
|
||||||
for k in table_bbox.keys():
|
for k in table_bbox.keys():
|
||||||
for coord in table_bbox[k]:
|
for coord in table_bbox[k]:
|
||||||
x_coord.append(coord[0])
|
x_coord.append(coord[0])
|
||||||
y_coord.append(coord[1])
|
y_coord.append(coord[1])
|
||||||
ax.plot(x_coord, y_coord, 'ro')
|
ax.plot(x_coord, y_coord, "ro")
|
||||||
ax.imshow(img)
|
ax.imshow(img)
|
||||||
return fig
|
return fig
|
||||||
|
|
||||||
|
|
@ -235,7 +214,7 @@ class PlotMethods(object):
|
||||||
|
|
||||||
"""
|
"""
|
||||||
fig = plt.figure()
|
fig = plt.figure()
|
||||||
ax = fig.add_subplot(111, aspect='equal')
|
ax = fig.add_subplot(111, aspect="equal")
|
||||||
vertical, horizontal = table._segments
|
vertical, horizontal = table._segments
|
||||||
for v in vertical:
|
for v in vertical:
|
||||||
ax.plot([v[0], v[2]], [v[1], v[3]])
|
ax.plot([v[0], v[2]], [v[1], v[3]])
|
||||||
|
|
|
||||||
271
camelot/utils.py
271
camelot/utils.py
|
|
@ -19,8 +19,14 @@ from pdfminer.pdfpage import PDFTextExtractionNotAllowed
|
||||||
from pdfminer.pdfinterp import PDFResourceManager
|
from pdfminer.pdfinterp import PDFResourceManager
|
||||||
from pdfminer.pdfinterp import PDFPageInterpreter
|
from pdfminer.pdfinterp import PDFPageInterpreter
|
||||||
from pdfminer.converter import PDFPageAggregator
|
from pdfminer.converter import PDFPageAggregator
|
||||||
from pdfminer.layout import (LAParams, LTAnno, LTChar, LTTextLineHorizontal,
|
from pdfminer.layout import (
|
||||||
LTTextLineVertical, LTImage)
|
LAParams,
|
||||||
|
LTAnno,
|
||||||
|
LTChar,
|
||||||
|
LTTextLineHorizontal,
|
||||||
|
LTTextLineVertical,
|
||||||
|
LTImage,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
PY3 = sys.version_info[0] >= 3
|
PY3 = sys.version_info[0] >= 3
|
||||||
|
|
@ -35,7 +41,7 @@ else:
|
||||||
|
|
||||||
|
|
||||||
_VALID_URLS = set(uses_relative + uses_netloc + uses_params)
|
_VALID_URLS = set(uses_relative + uses_netloc + uses_params)
|
||||||
_VALID_URLS.discard('')
|
_VALID_URLS.discard("")
|
||||||
|
|
||||||
|
|
||||||
# https://github.com/pandas-dev/pandas/blob/master/pandas/io/common.py
|
# https://github.com/pandas-dev/pandas/blob/master/pandas/io/common.py
|
||||||
|
|
@ -59,9 +65,11 @@ def is_url(url):
|
||||||
|
|
||||||
|
|
||||||
def random_string(length):
|
def random_string(length):
|
||||||
ret = ''
|
ret = ""
|
||||||
while length:
|
while length:
|
||||||
ret += random.choice(string.digits + string.ascii_lowercase + string.ascii_uppercase)
|
ret += random.choice(
|
||||||
|
string.digits + string.ascii_lowercase + string.ascii_uppercase
|
||||||
|
)
|
||||||
length -= 1
|
length -= 1
|
||||||
return ret
|
return ret
|
||||||
|
|
||||||
|
|
@ -79,14 +87,14 @@ def download_url(url):
|
||||||
Temporary filepath.
|
Temporary filepath.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
filename = '{}.pdf'.format(random_string(6))
|
filename = "{}.pdf".format(random_string(6))
|
||||||
with tempfile.NamedTemporaryFile('wb', delete=False) as f:
|
with tempfile.NamedTemporaryFile("wb", delete=False) as f:
|
||||||
obj = urlopen(url)
|
obj = urlopen(url)
|
||||||
if PY3:
|
if PY3:
|
||||||
content_type = obj.info().get_content_type()
|
content_type = obj.info().get_content_type()
|
||||||
else:
|
else:
|
||||||
content_type = obj.info().getheader('Content-Type')
|
content_type = obj.info().getheader("Content-Type")
|
||||||
if content_type != 'application/pdf':
|
if content_type != "application/pdf":
|
||||||
raise NotImplementedError("File format not supported")
|
raise NotImplementedError("File format not supported")
|
||||||
f.write(obj.read())
|
f.write(obj.read())
|
||||||
filepath = os.path.join(os.path.dirname(f.name), filename)
|
filepath = os.path.join(os.path.dirname(f.name), filename)
|
||||||
|
|
@ -94,39 +102,38 @@ def download_url(url):
|
||||||
return filepath
|
return filepath
|
||||||
|
|
||||||
|
|
||||||
stream_kwargs = [
|
stream_kwargs = ["columns", "row_tol", "column_tol"]
|
||||||
'columns',
|
|
||||||
'row_tol',
|
|
||||||
'column_tol'
|
|
||||||
]
|
|
||||||
lattice_kwargs = [
|
lattice_kwargs = [
|
||||||
'process_background',
|
"process_background",
|
||||||
'line_scale',
|
"line_scale",
|
||||||
'copy_text',
|
"copy_text",
|
||||||
'shift_text',
|
"shift_text",
|
||||||
'line_tol',
|
"line_tol",
|
||||||
'joint_tol',
|
"joint_tol",
|
||||||
'threshold_blocksize',
|
"threshold_blocksize",
|
||||||
'threshold_constant',
|
"threshold_constant",
|
||||||
'iterations'
|
"iterations",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
def validate_input(kwargs, flavor='lattice'):
|
def validate_input(kwargs, flavor="lattice"):
|
||||||
def check_intersection(parser_kwargs, input_kwargs):
|
def check_intersection(parser_kwargs, input_kwargs):
|
||||||
isec = set(parser_kwargs).intersection(set(input_kwargs.keys()))
|
isec = set(parser_kwargs).intersection(set(input_kwargs.keys()))
|
||||||
if isec:
|
if isec:
|
||||||
raise ValueError("{} cannot be used with flavor='{}'".format(
|
raise ValueError(
|
||||||
",".join(sorted(isec)), flavor))
|
"{} cannot be used with flavor='{}'".format(
|
||||||
|
",".join(sorted(isec)), flavor
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
if flavor == 'lattice':
|
if flavor == "lattice":
|
||||||
check_intersection(stream_kwargs, kwargs)
|
check_intersection(stream_kwargs, kwargs)
|
||||||
else:
|
else:
|
||||||
check_intersection(lattice_kwargs, kwargs)
|
check_intersection(lattice_kwargs, kwargs)
|
||||||
|
|
||||||
|
|
||||||
def remove_extra(kwargs, flavor='lattice'):
|
def remove_extra(kwargs, flavor="lattice"):
|
||||||
if flavor == 'lattice':
|
if flavor == "lattice":
|
||||||
for key in kwargs.keys():
|
for key in kwargs.keys():
|
||||||
if key in stream_kwargs:
|
if key in stream_kwargs:
|
||||||
kwargs.pop(key)
|
kwargs.pop(key)
|
||||||
|
|
@ -256,15 +263,19 @@ def scale_image(tables, v_segments, h_segments, factors):
|
||||||
v_segments_new = []
|
v_segments_new = []
|
||||||
for v in v_segments:
|
for v in v_segments:
|
||||||
x1, x2 = scale(v[0], scaling_factor_x), scale(v[2], scaling_factor_x)
|
x1, x2 = scale(v[0], scaling_factor_x), scale(v[2], scaling_factor_x)
|
||||||
y1, y2 = scale(abs(translate(-img_y, v[1])), scaling_factor_y), scale(
|
y1, y2 = (
|
||||||
abs(translate(-img_y, v[3])), scaling_factor_y)
|
scale(abs(translate(-img_y, v[1])), scaling_factor_y),
|
||||||
|
scale(abs(translate(-img_y, v[3])), scaling_factor_y),
|
||||||
|
)
|
||||||
v_segments_new.append((x1, y1, x2, y2))
|
v_segments_new.append((x1, y1, x2, y2))
|
||||||
|
|
||||||
h_segments_new = []
|
h_segments_new = []
|
||||||
for h in h_segments:
|
for h in h_segments:
|
||||||
x1, x2 = scale(h[0], scaling_factor_x), scale(h[2], scaling_factor_x)
|
x1, x2 = scale(h[0], scaling_factor_x), scale(h[2], scaling_factor_x)
|
||||||
y1, y2 = scale(abs(translate(-img_y, h[1])), scaling_factor_y), scale(
|
y1, y2 = (
|
||||||
abs(translate(-img_y, h[3])), scaling_factor_y)
|
scale(abs(translate(-img_y, h[1])), scaling_factor_y),
|
||||||
|
scale(abs(translate(-img_y, h[3])), scaling_factor_y),
|
||||||
|
)
|
||||||
h_segments_new.append((x1, y1, x2, y2))
|
h_segments_new.append((x1, y1, x2, y2))
|
||||||
|
|
||||||
return tables_new, v_segments_new, h_segments_new
|
return tables_new, v_segments_new, h_segments_new
|
||||||
|
|
@ -291,13 +302,13 @@ def get_rotation(chars, horizontal_text, vertical_text):
|
||||||
rotated 90 degree clockwise.
|
rotated 90 degree clockwise.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
rotation = ''
|
rotation = ""
|
||||||
hlen = len([t for t in horizontal_text if t.get_text().strip()])
|
hlen = len([t for t in horizontal_text if t.get_text().strip()])
|
||||||
vlen = len([t for t in vertical_text if t.get_text().strip()])
|
vlen = len([t for t in vertical_text if t.get_text().strip()])
|
||||||
if hlen < vlen:
|
if hlen < vlen:
|
||||||
clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in chars)
|
clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in chars)
|
||||||
anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in chars)
|
anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in chars)
|
||||||
rotation = 'anticlockwise' if clockwise < anticlockwise else 'clockwise'
|
rotation = "anticlockwise" if clockwise < anticlockwise else "clockwise"
|
||||||
return rotation
|
return rotation
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -325,10 +336,16 @@ def segments_in_bbox(bbox, v_segments, h_segments):
|
||||||
"""
|
"""
|
||||||
lb = (bbox[0], bbox[1])
|
lb = (bbox[0], bbox[1])
|
||||||
rt = (bbox[2], bbox[3])
|
rt = (bbox[2], bbox[3])
|
||||||
v_s = [v for v in v_segments if v[1] > lb[1] - 2 and
|
v_s = [
|
||||||
v[3] < rt[1] + 2 and lb[0] - 2 <= v[0] <= rt[0] + 2]
|
v
|
||||||
h_s = [h for h in h_segments if h[0] > lb[0] - 2 and
|
for v in v_segments
|
||||||
h[2] < rt[0] + 2 and lb[1] - 2 <= h[1] <= rt[1] + 2]
|
if v[1] > lb[1] - 2 and v[3] < rt[1] + 2 and lb[0] - 2 <= v[0] <= rt[0] + 2
|
||||||
|
]
|
||||||
|
h_s = [
|
||||||
|
h
|
||||||
|
for h in h_segments
|
||||||
|
if h[0] > lb[0] - 2 and h[2] < rt[0] + 2 and lb[1] - 2 <= h[1] <= rt[1] + 2
|
||||||
|
]
|
||||||
return v_s, h_s
|
return v_s, h_s
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -351,9 +368,12 @@ def text_in_bbox(bbox, text):
|
||||||
"""
|
"""
|
||||||
lb = (bbox[0], bbox[1])
|
lb = (bbox[0], bbox[1])
|
||||||
rt = (bbox[2], bbox[3])
|
rt = (bbox[2], bbox[3])
|
||||||
t_bbox = [t for t in text if lb[0] - 2 <= (t.x0 + t.x1) / 2.0
|
t_bbox = [
|
||||||
<= rt[0] + 2 and lb[1] - 2 <= (t.y0 + t.y1) / 2.0
|
t
|
||||||
<= rt[1] + 2]
|
for t in text
|
||||||
|
if lb[0] - 2 <= (t.x0 + t.x1) / 2.0 <= rt[0] + 2
|
||||||
|
and lb[1] - 2 <= (t.y0 + t.y1) / 2.0 <= rt[1] + 2
|
||||||
|
]
|
||||||
return t_bbox
|
return t_bbox
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -390,7 +410,7 @@ def merge_close_lines(ar, line_tol=2):
|
||||||
# (inspired from sklearn.pipeline.Pipeline)
|
# (inspired from sklearn.pipeline.Pipeline)
|
||||||
|
|
||||||
|
|
||||||
def flag_font_size(textline, direction, strip_text=''):
|
def flag_font_size(textline, direction, strip_text=""):
|
||||||
"""Flags super/subscripts in text by enclosing them with <s></s>.
|
"""Flags super/subscripts in text by enclosing them with <s></s>.
|
||||||
May give false positives.
|
May give false positives.
|
||||||
|
|
||||||
|
|
@ -409,10 +429,18 @@ def flag_font_size(textline, direction, strip_text=''):
|
||||||
fstring : string
|
fstring : string
|
||||||
|
|
||||||
"""
|
"""
|
||||||
if direction == 'horizontal':
|
if direction == "horizontal":
|
||||||
d = [(t.get_text(), np.round(t.height, decimals=6)) for t in textline if not isinstance(t, LTAnno)]
|
d = [
|
||||||
elif direction == 'vertical':
|
(t.get_text(), np.round(t.height, decimals=6))
|
||||||
d = [(t.get_text(), np.round(t.width, decimals=6)) for t in textline if not isinstance(t, LTAnno)]
|
for t in textline
|
||||||
|
if not isinstance(t, LTAnno)
|
||||||
|
]
|
||||||
|
elif direction == "vertical":
|
||||||
|
d = [
|
||||||
|
(t.get_text(), np.round(t.width, decimals=6))
|
||||||
|
for t in textline
|
||||||
|
if not isinstance(t, LTAnno)
|
||||||
|
]
|
||||||
l = [np.round(size, decimals=6) for text, size in d]
|
l = [np.round(size, decimals=6) for text, size in d]
|
||||||
if len(set(l)) > 1:
|
if len(set(l)) > 1:
|
||||||
flist = []
|
flist = []
|
||||||
|
|
@ -420,21 +448,21 @@ def flag_font_size(textline, direction, strip_text=''):
|
||||||
for key, chars in groupby(d, itemgetter(1)):
|
for key, chars in groupby(d, itemgetter(1)):
|
||||||
if key == min_size:
|
if key == min_size:
|
||||||
fchars = [t[0] for t in chars]
|
fchars = [t[0] for t in chars]
|
||||||
if ''.join(fchars).strip():
|
if "".join(fchars).strip():
|
||||||
fchars.insert(0, '<s>')
|
fchars.insert(0, "<s>")
|
||||||
fchars.append('</s>')
|
fchars.append("</s>")
|
||||||
flist.append(''.join(fchars))
|
flist.append("".join(fchars))
|
||||||
else:
|
else:
|
||||||
fchars = [t[0] for t in chars]
|
fchars = [t[0] for t in chars]
|
||||||
if ''.join(fchars).strip():
|
if "".join(fchars).strip():
|
||||||
flist.append(''.join(fchars))
|
flist.append("".join(fchars))
|
||||||
fstring = ''.join(flist).strip(strip_text)
|
fstring = "".join(flist).strip(strip_text)
|
||||||
else:
|
else:
|
||||||
fstring = ''.join([t.get_text() for t in textline]).strip(strip_text)
|
fstring = "".join([t.get_text() for t in textline]).strip(strip_text)
|
||||||
return fstring
|
return fstring
|
||||||
|
|
||||||
|
|
||||||
def split_textline(table, textline, direction, flag_size=False, strip_text=''):
|
def split_textline(table, textline, direction, flag_size=False, strip_text=""):
|
||||||
"""Splits PDFMiner LTTextLine into substrings if it spans across
|
"""Splits PDFMiner LTTextLine into substrings if it spans across
|
||||||
multiple rows/columns.
|
multiple rows/columns.
|
||||||
|
|
||||||
|
|
@ -464,19 +492,31 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=''):
|
||||||
cut_text = []
|
cut_text = []
|
||||||
bbox = textline.bbox
|
bbox = textline.bbox
|
||||||
try:
|
try:
|
||||||
if direction == 'horizontal' and not textline.is_empty():
|
if direction == "horizontal" and not textline.is_empty():
|
||||||
x_overlap = [i for i, x in enumerate(table.cols) if x[0] <= bbox[2] and bbox[0] <= x[1]]
|
x_overlap = [
|
||||||
r_idx = [j for j, r in enumerate(table.rows) if r[1] <= (bbox[1] + bbox[3]) / 2 <= r[0]]
|
i
|
||||||
|
for i, x in enumerate(table.cols)
|
||||||
|
if x[0] <= bbox[2] and bbox[0] <= x[1]
|
||||||
|
]
|
||||||
|
r_idx = [
|
||||||
|
j
|
||||||
|
for j, r in enumerate(table.rows)
|
||||||
|
if r[1] <= (bbox[1] + bbox[3]) / 2 <= r[0]
|
||||||
|
]
|
||||||
r = r_idx[0]
|
r = r_idx[0]
|
||||||
x_cuts = [(c, table.cells[r][c].x2) for c in x_overlap if table.cells[r][c].right]
|
x_cuts = [
|
||||||
|
(c, table.cells[r][c].x2) for c in x_overlap if table.cells[r][c].right
|
||||||
|
]
|
||||||
if not x_cuts:
|
if not x_cuts:
|
||||||
x_cuts = [(x_overlap[0], table.cells[r][-1].x2)]
|
x_cuts = [(x_overlap[0], table.cells[r][-1].x2)]
|
||||||
for obj in textline._objs:
|
for obj in textline._objs:
|
||||||
row = table.rows[r]
|
row = table.rows[r]
|
||||||
for cut in x_cuts:
|
for cut in x_cuts:
|
||||||
if isinstance(obj, LTChar):
|
if isinstance(obj, LTChar):
|
||||||
if (row[1] <= (obj.y0 + obj.y1) / 2 <= row[0] and
|
if (
|
||||||
(obj.x0 + obj.x1) / 2 <= cut[1]):
|
row[1] <= (obj.y0 + obj.y1) / 2 <= row[0]
|
||||||
|
and (obj.x0 + obj.x1) / 2 <= cut[1]
|
||||||
|
):
|
||||||
cut_text.append((r, cut[0], obj))
|
cut_text.append((r, cut[0], obj))
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
|
|
@ -485,19 +525,31 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=''):
|
||||||
cut_text.append((r, cut[0] + 1, obj))
|
cut_text.append((r, cut[0] + 1, obj))
|
||||||
elif isinstance(obj, LTAnno):
|
elif isinstance(obj, LTAnno):
|
||||||
cut_text.append((r, cut[0], obj))
|
cut_text.append((r, cut[0], obj))
|
||||||
elif direction == 'vertical' and not textline.is_empty():
|
elif direction == "vertical" and not textline.is_empty():
|
||||||
y_overlap = [j for j, y in enumerate(table.rows) if y[1] <= bbox[3] and bbox[1] <= y[0]]
|
y_overlap = [
|
||||||
c_idx = [i for i, c in enumerate(table.cols) if c[0] <= (bbox[0] + bbox[2]) / 2 <= c[1]]
|
j
|
||||||
|
for j, y in enumerate(table.rows)
|
||||||
|
if y[1] <= bbox[3] and bbox[1] <= y[0]
|
||||||
|
]
|
||||||
|
c_idx = [
|
||||||
|
i
|
||||||
|
for i, c in enumerate(table.cols)
|
||||||
|
if c[0] <= (bbox[0] + bbox[2]) / 2 <= c[1]
|
||||||
|
]
|
||||||
c = c_idx[0]
|
c = c_idx[0]
|
||||||
y_cuts = [(r, table.cells[r][c].y1) for r in y_overlap if table.cells[r][c].bottom]
|
y_cuts = [
|
||||||
|
(r, table.cells[r][c].y1) for r in y_overlap if table.cells[r][c].bottom
|
||||||
|
]
|
||||||
if not y_cuts:
|
if not y_cuts:
|
||||||
y_cuts = [(y_overlap[0], table.cells[-1][c].y1)]
|
y_cuts = [(y_overlap[0], table.cells[-1][c].y1)]
|
||||||
for obj in textline._objs:
|
for obj in textline._objs:
|
||||||
col = table.cols[c]
|
col = table.cols[c]
|
||||||
for cut in y_cuts:
|
for cut in y_cuts:
|
||||||
if isinstance(obj, LTChar):
|
if isinstance(obj, LTChar):
|
||||||
if (col[0] <= (obj.x0 + obj.x1) / 2 <= col[1] and
|
if (
|
||||||
(obj.y0 + obj.y1) / 2 >= cut[1]):
|
col[0] <= (obj.x0 + obj.x1) / 2 <= col[1]
|
||||||
|
and (obj.y0 + obj.y1) / 2 >= cut[1]
|
||||||
|
):
|
||||||
cut_text.append((cut[0], c, obj))
|
cut_text.append((cut[0], c, obj))
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
|
|
@ -511,15 +563,24 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=''):
|
||||||
grouped_chars = []
|
grouped_chars = []
|
||||||
for key, chars in groupby(cut_text, itemgetter(0, 1)):
|
for key, chars in groupby(cut_text, itemgetter(0, 1)):
|
||||||
if flag_size:
|
if flag_size:
|
||||||
grouped_chars.append((key[0], key[1],
|
grouped_chars.append(
|
||||||
flag_font_size([t[2] for t in chars], direction, strip_text=strip_text)))
|
(
|
||||||
|
key[0],
|
||||||
|
key[1],
|
||||||
|
flag_font_size(
|
||||||
|
[t[2] for t in chars], direction, strip_text=strip_text
|
||||||
|
),
|
||||||
|
)
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
gchars = [t[2].get_text() for t in chars]
|
gchars = [t[2].get_text() for t in chars]
|
||||||
grouped_chars.append((key[0], key[1], ''.join(gchars).strip(strip_text)))
|
grouped_chars.append((key[0], key[1], "".join(gchars).strip(strip_text)))
|
||||||
return grouped_chars
|
return grouped_chars
|
||||||
|
|
||||||
|
|
||||||
def get_table_index(table, t, direction, split_text=False, flag_size=False, strip_text='',):
|
def get_table_index(
|
||||||
|
table, t, direction, split_text=False, flag_size=False, strip_text=""
|
||||||
|
):
|
||||||
"""Gets indices of the table cell where given text object lies by
|
"""Gets indices of the table cell where given text object lies by
|
||||||
comparing their y and x-coordinates.
|
comparing their y and x-coordinates.
|
||||||
|
|
||||||
|
|
@ -558,8 +619,9 @@ def get_table_index(table, t, direction, split_text=False, flag_size=False, stri
|
||||||
"""
|
"""
|
||||||
r_idx, c_idx = [-1] * 2
|
r_idx, c_idx = [-1] * 2
|
||||||
for r in range(len(table.rows)):
|
for r in range(len(table.rows)):
|
||||||
if ((t.y0 + t.y1) / 2.0 < table.rows[r][0] and
|
if (t.y0 + t.y1) / 2.0 < table.rows[r][0] and (t.y0 + t.y1) / 2.0 > table.rows[
|
||||||
(t.y0 + t.y1) / 2.0 > table.rows[r][1]):
|
r
|
||||||
|
][1]:
|
||||||
lt_col_overlap = []
|
lt_col_overlap = []
|
||||||
for c in table.cols:
|
for c in table.cols:
|
||||||
if c[0] <= t.x1 and c[1] >= t.x0:
|
if c[0] <= t.x1 and c[1] >= t.x0:
|
||||||
|
|
@ -569,11 +631,14 @@ def get_table_index(table, t, direction, split_text=False, flag_size=False, stri
|
||||||
else:
|
else:
|
||||||
lt_col_overlap.append(-1)
|
lt_col_overlap.append(-1)
|
||||||
if len(list(filter(lambda x: x != -1, lt_col_overlap))) == 0:
|
if len(list(filter(lambda x: x != -1, lt_col_overlap))) == 0:
|
||||||
text = t.get_text().strip('\n')
|
text = t.get_text().strip("\n")
|
||||||
text_range = (t.x0, t.x1)
|
text_range = (t.x0, t.x1)
|
||||||
col_range = (table.cols[0][0], table.cols[-1][1])
|
col_range = (table.cols[0][0], table.cols[-1][1])
|
||||||
warnings.warn("{} {} does not lie in column range {}".format(
|
warnings.warn(
|
||||||
text, text_range, col_range))
|
"{} {} does not lie in column range {}".format(
|
||||||
|
text, text_range, col_range
|
||||||
|
)
|
||||||
|
)
|
||||||
r_idx = r
|
r_idx = r
|
||||||
c_idx = lt_col_overlap.index(max(lt_col_overlap))
|
c_idx = lt_col_overlap.index(max(lt_col_overlap))
|
||||||
break
|
break
|
||||||
|
|
@ -594,10 +659,24 @@ def get_table_index(table, t, direction, split_text=False, flag_size=False, stri
|
||||||
error = ((X * (y0_offset + y1_offset)) + (Y * (x0_offset + x1_offset))) / charea
|
error = ((X * (y0_offset + y1_offset)) + (Y * (x0_offset + x1_offset))) / charea
|
||||||
|
|
||||||
if split_text:
|
if split_text:
|
||||||
return split_textline(table, t, direction, flag_size=flag_size, strip_text=strip_text), error
|
return (
|
||||||
|
split_textline(
|
||||||
|
table, t, direction, flag_size=flag_size, strip_text=strip_text
|
||||||
|
),
|
||||||
|
error,
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
if flag_size:
|
if flag_size:
|
||||||
return [(r_idx, c_idx, flag_font_size(t._objs, direction, strip_text=strip_text))], error
|
return (
|
||||||
|
[
|
||||||
|
(
|
||||||
|
r_idx,
|
||||||
|
c_idx,
|
||||||
|
flag_font_size(t._objs, direction, strip_text=strip_text),
|
||||||
|
)
|
||||||
|
],
|
||||||
|
error,
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
return [(r_idx, c_idx, t.get_text().strip(strip_text))], error
|
return [(r_idx, c_idx, t.get_text().strip(strip_text))], error
|
||||||
|
|
||||||
|
|
@ -650,14 +729,20 @@ def compute_whitespace(d):
|
||||||
r_nempty_cells, c_nempty_cells = [], []
|
r_nempty_cells, c_nempty_cells = [], []
|
||||||
for i in d:
|
for i in d:
|
||||||
for j in i:
|
for j in i:
|
||||||
if j.strip() == '':
|
if j.strip() == "":
|
||||||
whitespace += 1
|
whitespace += 1
|
||||||
whitespace = 100 * (whitespace / float(len(d) * len(d[0])))
|
whitespace = 100 * (whitespace / float(len(d) * len(d[0])))
|
||||||
return whitespace
|
return whitespace
|
||||||
|
|
||||||
|
|
||||||
def get_page_layout(filename, char_margin=1.0, line_margin=0.5, word_margin=0.1,
|
def get_page_layout(
|
||||||
detect_vertical=True, all_texts=True):
|
filename,
|
||||||
|
char_margin=1.0,
|
||||||
|
line_margin=0.5,
|
||||||
|
word_margin=0.1,
|
||||||
|
detect_vertical=True,
|
||||||
|
all_texts=True,
|
||||||
|
):
|
||||||
"""Returns a PDFMiner LTPage object and page dimension of a single
|
"""Returns a PDFMiner LTPage object and page dimension of a single
|
||||||
page pdf. See https://euske.github.io/pdfminer/ to get definitions
|
page pdf. See https://euske.github.io/pdfminer/ to get definitions
|
||||||
of kwargs.
|
of kwargs.
|
||||||
|
|
@ -680,16 +765,18 @@ def get_page_layout(filename, char_margin=1.0, line_margin=0.5, word_margin=0.1,
|
||||||
Dimension of pdf page in the form (width, height).
|
Dimension of pdf page in the form (width, height).
|
||||||
|
|
||||||
"""
|
"""
|
||||||
with open(filename, 'rb') as f:
|
with open(filename, "rb") as f:
|
||||||
parser = PDFParser(f)
|
parser = PDFParser(f)
|
||||||
document = PDFDocument(parser)
|
document = PDFDocument(parser)
|
||||||
if not document.is_extractable:
|
if not document.is_extractable:
|
||||||
raise PDFTextExtractionNotAllowed
|
raise PDFTextExtractionNotAllowed
|
||||||
laparams = LAParams(char_margin=char_margin,
|
laparams = LAParams(
|
||||||
line_margin=line_margin,
|
char_margin=char_margin,
|
||||||
word_margin=word_margin,
|
line_margin=line_margin,
|
||||||
detect_vertical=detect_vertical,
|
word_margin=word_margin,
|
||||||
all_texts=all_texts)
|
detect_vertical=detect_vertical,
|
||||||
|
all_texts=all_texts,
|
||||||
|
)
|
||||||
rsrcmgr = PDFResourceManager()
|
rsrcmgr = PDFResourceManager()
|
||||||
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
|
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
|
||||||
interpreter = PDFPageInterpreter(rsrcmgr, device)
|
interpreter = PDFPageInterpreter(rsrcmgr, device)
|
||||||
|
|
@ -721,13 +808,13 @@ def get_text_objects(layout, ltype="char", t=None):
|
||||||
List of PDFMiner text objects.
|
List of PDFMiner text objects.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
if ltype == 'char':
|
if ltype == "char":
|
||||||
LTObject = LTChar
|
LTObject = LTChar
|
||||||
elif ltype == 'image':
|
elif ltype == "image":
|
||||||
LTObject = LTImage
|
LTObject = LTImage
|
||||||
elif ltype == 'horizontal_text':
|
elif ltype == "horizontal_text":
|
||||||
LTObject = LTTextLineHorizontal
|
LTObject = LTTextLineHorizontal
|
||||||
elif ltype == 'vertical_text':
|
elif ltype == "vertical_text":
|
||||||
LTObject = LTTextLineVertical
|
LTObject = LTTextLineVertical
|
||||||
if t is None:
|
if t is None:
|
||||||
t = []
|
t = []
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue