Blacken code

pull/1/head
Vinayak Mehta 2019-07-03 22:04:19 +05:30
parent 27d55d056c
commit 2115a0e177
15 changed files with 892 additions and 551 deletions
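
The change is purely mechanical: Black normalizes string quotes to double quotes and rewraps calls that exceed its line length, with no behavioral change. A representative before/after pair taken from the hunks below:

# before
tables = read_pdf(filepath, pages=pages, flavor='lattice',
                  suppress_stdout=quiet, **kwargs)

# after
tables = read_pdf(
    filepath, pages=pages, flavor="lattice", suppress_stdout=quiet, **kwargs
)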

View File

@@ -9,8 +9,8 @@ from .io import read_pdf
from .plotting import PlotMethods


def _write_usage(self, prog, args="", prefix="Usage: "):
    return self._write_usage("camelot", args, prefix=prefix)


# monkey patch click.HelpFormatter

@@ -18,10 +18,10 @@ HelpFormatter._write_usage = HelpFormatter.write_usage
HelpFormatter.write_usage = _write_usage

# set up logging
logger = logging.getLogger("camelot")
format_string = "%(asctime)s - %(levelname)s - %(message)s"
formatter = logging.Formatter(format_string, datefmt="%Y-%m-%dT%H:%M:%S")
handler = logging.StreamHandler()
handler.setFormatter(formatter)

View File

@@ -3,7 +3,7 @@
from __future__ import absolute_import

__all__ = ("main",)


def main():

View File

@@ -1,23 +1,23 @@
# -*- coding: utf-8 -*-

VERSION = (0, 7, 2)
PRERELEASE = None  # alpha, beta or rc
REVISION = None


def generate_version(version, prerelease=None, revision=None):
    version_parts = [".".join(map(str, version))]
    if prerelease is not None:
        version_parts.append("-{}".format(prerelease))
    if revision is not None:
        version_parts.append(".{}".format(revision))
    return "".join(version_parts)


__title__ = "camelot-py"
__description__ = "PDF Table Extraction for Humans."
__url__ = "http://camelot-py.readthedocs.io/"
__version__ = generate_version(VERSION, prerelease=PRERELEASE, revision=REVISION)
__author__ = "Vinayak Mehta"
__author_email__ = "vmehta94@gmail.com"
__license__ = "MIT License"
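
As a quick check of the version-string logic above (the prerelease and revision values here are hypothetical):

assert generate_version((0, 7, 2)) == "0.7.2"
assert generate_version((0, 7, 2), prerelease="alpha", revision=1) == "0.7.2-alpha.1"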

View File

@@ -3,6 +3,7 @@
import logging

import click

try:
    import matplotlib.pyplot as plt
except ImportError:

@@ -13,7 +14,7 @@ else:
from . import __version__, read_pdf, plot

logger = logging.getLogger("camelot")
logger.setLevel(logging.INFO)

@@ -30,23 +31,47 @@ pass_config = click.make_pass_decorator(Config)
@click.group()
@click.version_option(version=__version__)
@click.option("-q", "--quiet", is_flag=False, help="Suppress logs and warnings.")
@click.option(
    "-p",
    "--pages",
    default="1",
    help="Comma-separated page numbers." " Example: 1,3,4 or 1,4-end or all.",
)
@click.option("-pw", "--password", help="Password for decryption.")
@click.option("-o", "--output", help="Output file path.")
@click.option(
    "-f",
    "--format",
    type=click.Choice(["csv", "json", "excel", "html", "sqlite"]),
    help="Output file format.",
)
@click.option("-z", "--zip", is_flag=True, help="Create ZIP archive.")
@click.option(
    "-split",
    "--split_text",
    is_flag=True,
    help="Split text that spans across multiple cells.",
)
@click.option(
    "-flag",
    "--flag_size",
    is_flag=True,
    help="Flag text based on" " font size. Useful to detect super/subscripts.",
)
@click.option(
    "-strip",
    "--strip_text",
    help="Characters that should be stripped from a string before"
    " assigning it to a cell.",
)
@click.option(
    "-M",
    "--margins",
    nargs=3,
    default=(1.0, 0.5, 0.1),
    help="PDFMiner char_margin, line_margin and word_margin.",
)
@click.pass_context
def cli(ctx, *args, **kwargs):
    """Camelot: PDF Table Extraction for Humans"""

@@ -55,79 +80,131 @@ def cli(ctx, *args, **kwargs):
        ctx.obj.set_config(key, value)


@cli.command("lattice")
@click.option(
    "-R",
    "--table_regions",
    default=[],
    multiple=True,
    help="Page regions to analyze. Example: x1,y1,x2,y2"
    " where x1, y1 -> left-top and x2, y2 -> right-bottom.",
)
@click.option(
    "-T",
    "--table_areas",
    default=[],
    multiple=True,
    help="Table areas to process. Example: x1,y1,x2,y2"
    " where x1, y1 -> left-top and x2, y2 -> right-bottom.",
)
@click.option(
    "-back", "--process_background", is_flag=True, help="Process background lines."
)
@click.option(
    "-scale",
    "--line_scale",
    default=15,
    help="Line size scaling factor. The larger the value,"
    " the smaller the detected lines.",
)
@click.option(
    "-copy",
    "--copy_text",
    default=[],
    type=click.Choice(["h", "v"]),
    multiple=True,
    help="Direction in which text in a spanning cell" " will be copied over.",
)
@click.option(
    "-shift",
    "--shift_text",
    default=["l", "t"],
    type=click.Choice(["", "l", "r", "t", "b"]),
    multiple=True,
    help="Direction in which text in a spanning cell will flow.",
)
@click.option(
    "-l",
    "--line_tol",
    default=2,
    help="Tolerance parameter used to merge close vertical" " and horizontal lines.",
)
@click.option(
    "-j",
    "--joint_tol",
    default=2,
    help="Tolerance parameter used to decide whether"
    " the detected lines and points lie close to each other.",
)
@click.option(
    "-block",
    "--threshold_blocksize",
    default=15,
    help="For adaptive thresholding, size of a pixel"
    " neighborhood that is used to calculate a threshold value for"
    " the pixel. Example: 3, 5, 7, and so on.",
)
@click.option(
    "-const",
    "--threshold_constant",
    default=-2,
    help="For adaptive thresholding, constant subtracted"
    " from the mean or weighted mean. Normally, it is positive but"
    " may be zero or negative as well.",
)
@click.option(
    "-I",
    "--iterations",
    default=0,
    help="Number of times for erosion/dilation will be applied.",
)
@click.option(
    "-res",
    "--resolution",
    default=300,
    help="Resolution used for PDF to PNG conversion.",
)
@click.option(
    "-plot",
    "--plot_type",
    type=click.Choice(["text", "grid", "contour", "joint", "line"]),
    help="Plot elements found on PDF page for visual debugging.",
)
@click.argument("filepath", type=click.Path(exists=True))
@pass_config
def lattice(c, *args, **kwargs):
    """Use lines between text to parse the table."""
    conf = c.config
    pages = conf.pop("pages")
    output = conf.pop("output")
    f = conf.pop("format")
    compress = conf.pop("zip")
    quiet = conf.pop("quiet")
    plot_type = kwargs.pop("plot_type")
    filepath = kwargs.pop("filepath")
    kwargs.update(conf)

    table_regions = list(kwargs["table_regions"])
    kwargs["table_regions"] = None if not table_regions else table_regions
    table_areas = list(kwargs["table_areas"])
    kwargs["table_areas"] = None if not table_areas else table_areas
    copy_text = list(kwargs["copy_text"])
    kwargs["copy_text"] = None if not copy_text else copy_text
    kwargs["shift_text"] = list(kwargs["shift_text"])

    if plot_type is not None:
        if not _HAS_MPL:
            raise ImportError("matplotlib is required for plotting.")
    else:
        if output is None:
            raise click.UsageError("Please specify output file path using --output")
        if f is None:
            raise click.UsageError("Please specify output file format using --format")

    tables = read_pdf(
        filepath, pages=pages, flavor="lattice", suppress_stdout=quiet, **kwargs
    )
    click.echo("Found {} tables".format(tables.n))
    if plot_type is not None:
        for table in tables:
            plot(table, kind=plot_type)

@@ -136,57 +213,89 @@ def lattice(c, *args, **kwargs):
        tables.export(output, f=f, compress=compress)


@cli.command("stream")
@click.option(
    "-R",
    "--table_regions",
    default=[],
    multiple=True,
    help="Page regions to analyze. Example: x1,y1,x2,y2"
    " where x1, y1 -> left-top and x2, y2 -> right-bottom.",
)
@click.option(
    "-T",
    "--table_areas",
    default=[],
    multiple=True,
    help="Table areas to process. Example: x1,y1,x2,y2"
    " where x1, y1 -> left-top and x2, y2 -> right-bottom.",
)
@click.option(
    "-C",
    "--columns",
    default=[],
    multiple=True,
    help="X coordinates of column separators.",
)
@click.option(
    "-e",
    "--edge_tol",
    default=50,
    help="Tolerance parameter" " for extending textedges vertically.",
)
@click.option(
    "-r",
    "--row_tol",
    default=2,
    help="Tolerance parameter" " used to combine text vertically, to generate rows.",
)
@click.option(
    "-c",
    "--column_tol",
    default=0,
    help="Tolerance parameter"
    " used to combine text horizontally, to generate columns.",
)
@click.option(
    "-plot",
    "--plot_type",
    type=click.Choice(["text", "grid", "contour", "textedge"]),
    help="Plot elements found on PDF page for visual debugging.",
)
@click.argument("filepath", type=click.Path(exists=True))
@pass_config
def stream(c, *args, **kwargs):
    """Use spaces between text to parse the table."""
    conf = c.config
    pages = conf.pop("pages")
    output = conf.pop("output")
    f = conf.pop("format")
    compress = conf.pop("zip")
    quiet = conf.pop("quiet")
    plot_type = kwargs.pop("plot_type")
    filepath = kwargs.pop("filepath")
    kwargs.update(conf)

    table_regions = list(kwargs["table_regions"])
    kwargs["table_regions"] = None if not table_regions else table_regions
    table_areas = list(kwargs["table_areas"])
    kwargs["table_areas"] = None if not table_areas else table_areas
    columns = list(kwargs["columns"])
    kwargs["columns"] = None if not columns else columns

    if plot_type is not None:
        if not _HAS_MPL:
            raise ImportError("matplotlib is required for plotting.")
    else:
        if output is None:
            raise click.UsageError("Please specify output file path using --output")
        if f is None:
            raise click.UsageError("Please specify output file format using --format")

    tables = read_pdf(
        filepath, pages=pages, flavor="stream", suppress_stdout=quiet, **kwargs
    )
    click.echo("Found {} tables".format(tables.n))
    if plot_type is not None:
        for table in tables:
            plot(table, kind=plot_type)
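
For reference, the two subcommands above are thin wrappers over the library API. A rough Python equivalent of `camelot -p 1,2 -o tables.csv -f csv lattice report.pdf` (the file names are placeholders) would be:

import camelot

tables = camelot.read_pdf("report.pdf", pages="1,2", flavor="lattice")
print("Found {} tables".format(tables.n))
tables.export("tables.csv", f="csv", compress=False)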

View File

@@ -42,7 +42,8 @@ class TextEdge(object):
    TEXTEDGE_REQUIRED_ELEMENTS horizontal text rows.

    """

    def __init__(self, x, y0, y1, align="left"):
        self.x = x
        self.y0 = y0
        self.y1 = y1

@@ -51,8 +52,13 @@ class TextEdge(object):
        self.is_valid = False

    def __repr__(self):
        return "<TextEdge x={} y0={} y1={} align={} valid={}>".format(
            round(self.x, 2),
            round(self.y0, 2),
            round(self.y1, 2),
            self.align,
            self.is_valid,
        )

    def update_coords(self, x, y0, edge_tol=50):
        """Updates the text edge's x and bottom y coordinates and sets

@@ -73,9 +79,10 @@ class TextEdges(object):
        the PDF page. The dict has three keys based on the alignments,
        and each key's value is a list of camelot.core.TextEdge objects.
    """

    def __init__(self, edge_tol=50):
        self.edge_tol = edge_tol
        self._textedges = {"left": [], "right": [], "middle": []}

    @staticmethod
    def get_x_coord(textline, align):

@@ -85,7 +92,7 @@ class TextEdges(object):
        x_left = textline.x0
        x_right = textline.x1
        x_middle = x_left + (x_right - x_left) / 2.0
        x_coord = {"left": x_left, "middle": x_middle, "right": x_right}
        return x_coord[align]

    def find(self, x_coord, align):

@@ -109,21 +116,22 @@ class TextEdges(object):
    def update(self, textline):
        """Updates an existing text edge in the current dict.
        """
        for align in ["left", "right", "middle"]:
            x_coord = self.get_x_coord(textline, align)
            idx = self.find(x_coord, align)
            if idx is None:
                self.add(textline, align)
            else:
                self._textedges[align][idx].update_coords(
                    x_coord, textline.y0, edge_tol=self.edge_tol
                )

    def generate(self, textlines):
        """Generates the text edges dict based on horizontal text
        rows.
        """
        for tl in textlines:
            if len(tl.get_text().strip()) > 1:  # TODO: hacky
                self.update(tl)

    def get_relevant(self):

@@ -132,9 +140,15 @@ class TextEdges(object):
        the most.
        """
        intersections_sum = {
            "left": sum(
                te.intersections for te in self._textedges["left"] if te.is_valid
            ),
            "right": sum(
                te.intersections for te in self._textedges["right"] if te.is_valid
            ),
            "middle": sum(
                te.intersections for te in self._textedges["middle"] if te.is_valid
            ),
        }

        # TODO: naive

@@ -147,6 +161,7 @@ class TextEdges(object):
        """Returns a dict of interesting table areas on the PDF page
        calculated using relevant text edges.
        """

        def pad(area, average_row_height):
            x0 = area[0] - TABLE_AREA_PADDING
            y0 = area[1] - TABLE_AREA_PADDING

@@ -175,7 +190,11 @@ class TextEdges(object):
                else:
                    table_areas.pop(found)
                    updated_area = (
                        found[0],
                        min(te.y0, found[1]),
                        max(found[2], te.x),
                        max(found[3], te.y1),
                    )
                    table_areas[updated_area] = None

        # extend table areas based on textlines that overlap

@@ -196,7 +215,11 @@ class TextEdges(object):
            if found is not None:
                table_areas.pop(found)
                updated_area = (
                    min(tl.x0, found[0]),
                    min(tl.y0, found[1]),
                    max(found[2], tl.x1),
                    max(found[3], tl.y1),
                )
                table_areas[updated_area] = None

        average_textline_height = sum_textline_height / float(len(textlines))

@@ -265,11 +288,12 @@ class Cell(object):
        self.bottom = False
        self.hspan = False
        self.vspan = False
        self._text = ""

    def __repr__(self):
        return "<Cell x1={} y1={} x2={} y2={}>".format(
            round(self.x1, 2), round(self.y1, 2), round(self.x2, 2), round(self.y2, 2)
        )

    @property
    def text(self):

@@ -277,7 +301,7 @@ class Cell(object):
    @text.setter
    def text(self, t):
        self._text = "".join([self._text, t])

    @property
    def bound(self):

@@ -314,11 +338,11 @@ class Table(object):
        PDF page number.

    """

    def __init__(self, cols, rows):
        self.cols = cols
        self.rows = rows
        self.cells = [[Cell(c[0], r[1], c[1], r[0]) for c in cols] for r in rows]
        self.df = None
        self.shape = (0, 0)
        self.accuracy = 0

@@ -327,7 +351,7 @@ class Table(object):
        self.page = None

    def __repr__(self):
        return "<{} shape={}>".format(self.__class__.__name__, self.shape)

    def __lt__(self, other):
        if self.page == other.page:

@@ -352,10 +376,10 @@ class Table(object):
        """
        # pretty?
        report = {
            "accuracy": round(self.accuracy, 2),
            "whitespace": round(self.whitespace, 2),
            "order": self.order,
            "page": self.page,
        }
        return report

@@ -383,12 +407,21 @@ class Table(object):
        for v in vertical:
            # find closest x coord
            # iterate over y coords and find closest start and end points
            i = [
                i
                for i, t in enumerate(self.cols)
                if np.isclose(v[0], t[0], atol=joint_tol)
            ]
            j = [
                j
                for j, t in enumerate(self.rows)
                if np.isclose(v[3], t[0], atol=joint_tol)
            ]
            k = [
                k
                for k, t in enumerate(self.rows)
                if np.isclose(v[1], t[0], atol=joint_tol)
            ]
            if not j:
                continue
            J = j[0]

@@ -434,12 +467,21 @@ class Table(object):
        for h in horizontal:
            # find closest y coord
            # iterate over x coords and find closest start and end points
            i = [
                i
                for i, t in enumerate(self.rows)
                if np.isclose(h[1], t[0], atol=joint_tol)
            ]
            j = [
                j
                for j, t in enumerate(self.cols)
                if np.isclose(h[0], t[0], atol=joint_tol)
            ]
            k = [
                k
                for k, t in enumerate(self.cols)
                if np.isclose(h[2], t[0], atol=joint_tol)
            ]
            if not j:
                continue
            J = j[0]

@@ -537,12 +579,7 @@ class Table(object):
            Output filepath.

        """
        kw = {"encoding": "utf-8", "index": False, "header": False, "quoting": 1}
        kw.update(kwargs)
        self.df.to_csv(path, **kw)

@@ -557,12 +594,10 @@ class Table(object):
            Output filepath.

        """
        kw = {"orient": "records"}
        kw.update(kwargs)
        json_string = self.df.to_json(**kw)
        with open(path, "w") as f:
            f.write(json_string)

    def to_excel(self, path, **kwargs):

@@ -577,8 +612,8 @@ class Table(object):
        """
        kw = {
            "sheet_name": "page-{}-table-{}".format(self.page, self.order),
            "encoding": "utf-8",
        }
        kw.update(kwargs)
        writer = pd.ExcelWriter(path)

@@ -597,7 +632,7 @@ class Table(object):
        """
        html_string = self.df.to_html(**kwargs)
        with open(path, "w") as f:
            f.write(html_string)

    def to_sqlite(self, path, **kwargs):

@@ -611,13 +646,10 @@ class Table(object):
            Output filepath.

        """
        kw = {"if_exists": "replace", "index": False}
        kw.update(kwargs)
        conn = sqlite3.connect(path)
        table_name = "page-{}-table-{}".format(self.page, self.order)
        self.df.to_sql(table_name, conn, **kw)
        conn.commit()
        conn.close()

@@ -633,12 +665,12 @@ class TableList(object):
        Number of tables in the list.

    """

    def __init__(self, tables):
        self._tables = tables

    def __repr__(self):
        return "<{} n={}>".format(self.__class__.__name__, self.n)

    def __len__(self):
        return len(self._tables)

@@ -648,37 +680,39 @@ class TableList(object):
    @staticmethod
    def _format_func(table, f):
        return getattr(table, "to_{}".format(f))

    @property
    def n(self):
        return len(self)

    def _write_file(self, f=None, **kwargs):
        dirname = kwargs.get("dirname")
        root = kwargs.get("root")
        ext = kwargs.get("ext")
        for table in self._tables:
            filename = os.path.join(
                "{}-page-{}-table-{}{}".format(root, table.page, table.order, ext)
            )
            filepath = os.path.join(dirname, filename)
            to_format = self._format_func(table, f)
            to_format(filepath)

    def _compress_dir(self, **kwargs):
        path = kwargs.get("path")
        dirname = kwargs.get("dirname")
        root = kwargs.get("root")
        ext = kwargs.get("ext")
        zipname = os.path.join(os.path.dirname(path), root) + ".zip"
        with zipfile.ZipFile(zipname, "w", allowZip64=True) as z:
            for table in self._tables:
                filename = os.path.join(
                    "{}-page-{}-table-{}{}".format(root, table.page, table.order, ext)
                )
                filepath = os.path.join(dirname, filename)
                z.write(filepath, os.path.basename(filepath))

    def export(self, path, f="csv", compress=False):
        """Exports the list of tables to specified file format.

        Parameters

@@ -697,33 +731,28 @@ class TableList(object):
        if compress:
            dirname = tempfile.mkdtemp()

        kwargs = {"path": path, "dirname": dirname, "root": root, "ext": ext}

        if f in ["csv", "json", "html"]:
            self._write_file(f=f, **kwargs)
            if compress:
                self._compress_dir(**kwargs)
        elif f == "excel":
            filepath = os.path.join(dirname, basename)
            writer = pd.ExcelWriter(filepath)
            for table in self._tables:
                sheet_name = "page-{}-table-{}".format(table.page, table.order)
                table.df.to_excel(writer, sheet_name=sheet_name, encoding="utf-8")
            writer.save()
            if compress:
                zipname = os.path.join(os.path.dirname(path), root) + ".zip"
                with zipfile.ZipFile(zipname, "w", allowZip64=True) as z:
                    z.write(filepath, os.path.basename(filepath))
        elif f == "sqlite":
            filepath = os.path.join(dirname, basename)
            for table in self._tables:
                table.to_sqlite(filepath)
            if compress:
                zipname = os.path.join(os.path.dirname(path), root) + ".zip"
                with zipfile.ZipFile(zipname, "w", allowZip64=True) as z:
                    z.write(filepath, os.path.basename(filepath))
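
A brief usage sketch of the TableList and Table API touched above; the file names are placeholders, and iterating over the list mirrors the CLI code earlier in this diff:

# "tables" is the TableList returned by read_pdf()
tables.export("extracted.xlsx", f="excel", compress=True)  # csv, json, html, sqlite also work
for table in tables:
    table.to_csv("page-{}-table-{}.csv".format(table.page, table.order))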

View File

@@ -24,10 +24,10 @@ ghostscript - A Python interface for the Ghostscript interpreter C-API
from . import _gsprint as gs

__author__ = "Hartmut Goebel <h.goebel@crazy-compilers.com>"
__copyright__ = "Copyright 2010-2018 by Hartmut Goebel <h.goebel@crazy-compilers.com>"
__license__ = "GNU General Public License version 3 (GPL v3)"
__version__ = "0.6"


class __Ghostscript(object):

@@ -87,10 +87,13 @@ def Ghostscript(*args, **kwargs):
    # Ghostscript only supports a single instance
    if __instance__ is None:
        __instance__ = gs.new_instance()
    return __Ghostscript(
        __instance__,
        args,
        stdin=kwargs.get("stdin", None),
        stdout=kwargs.get("stdout", None),
        stderr=kwargs.get("stderr", None),
    )


__instance__ = None

View File

@@ -42,10 +42,10 @@ e_Info = -110
#
e_Quit = -101

__author__ = "Hartmut Goebel <h.goebel@crazy-compilers.com>"
__copyright__ = "Copyright 2010-2018 by Hartmut Goebel <h.goebel@crazy-compilers.com>"
__license__ = "GNU General Public License version 3 (GPL v3)"
__version__ = "0.6"

gs_main_instance = c_void_p
display_callback = c_void_p

@@ -55,7 +55,7 @@ display_callback = c_void_p
class GhostscriptError(Exception):
    def __init__(self, ecode):
        self.code = ecode


def new_instance():

@@ -89,6 +89,7 @@ def _wrap_stdin(infp):
    """Wrap a filehandle into a C function to be used as `stdin` callback
    for ``set_stdio``. The filehandle has to support the readline() method.
    """

    def _wrap(instance, dest, count):
        try:
            data = infp.readline(count)

@@ -110,6 +111,7 @@ def _wrap_stdout(outfp):
    `stderr` callback for ``set_stdio``. The filehandle has to support the
    write() and flush() methods.
    """

    def _wrap(instance, str, count):
        outfp.write(str[:count])
        outfp.flush()

@@ -187,11 +189,23 @@ def __win32_finddll():
        import winreg
    except ImportError:
        # assume Python 2
        from _winreg import (
            OpenKey,
            CloseKey,
            EnumKey,
            QueryValueEx,
            QueryInfoKey,
            HKEY_LOCAL_MACHINE,
        )
    else:
        from winreg import (
            OpenKey,
            CloseKey,
            EnumKey,
            QueryValueEx,
            QueryInfoKey,
            HKEY_LOCAL_MACHINE,
        )

    from distutils.version import LooseVersion
    import os

@@ -199,15 +213,19 @@ def __win32_finddll():
    dlls = []
    # Look up different variants of Ghostscript and take the highest
    # version for which the DLL is to be found in the filesystem.
    for key_name in (
        "AFPL Ghostscript",
        "Aladdin Ghostscript",
        "GNU Ghostscript",
        "GPL Ghostscript",
    ):
        try:
            k1 = OpenKey(HKEY_LOCAL_MACHINE, "Software\\%s" % key_name)
            for num in range(0, QueryInfoKey(k1)[0]):
                version = EnumKey(k1, num)
                try:
                    k2 = OpenKey(k1, version)
                    dll_path = QueryValueEx(k2, "GS_DLL")[0]
                    CloseKey(k2)
                    if os.path.exists(dll_path):
                        dlls.append((LooseVersion(version), dll_path))

@@ -223,21 +241,21 @@ def __win32_finddll():
    return None


if sys.platform == "win32":
    libgs = __win32_finddll()
    if not libgs:
        raise RuntimeError("Please make sure that Ghostscript is installed")
    libgs = windll.LoadLibrary(libgs)
else:
    try:
        libgs = cdll.LoadLibrary("libgs.so")
    except OSError:
        # shared object file not found
        import ctypes.util

        libgs = ctypes.util.find_library("gs")
        if not libgs:
            raise RuntimeError("Please make sure that Ghostscript is installed")
        libgs = cdll.LoadLibrary(libgs)

del __win32_finddll

View File

@@ -7,8 +7,14 @@ from PyPDF2 import PdfFileReader, PdfFileWriter
from .core import TableList
from .parsers import Stream, Lattice
from .utils import (
    TemporaryDirectory,
    get_page_layout,
    get_text_objects,
    get_rotation,
    is_url,
    download_url,
)


class PDFHandler(object):

@@ -27,19 +33,20 @@ class PDFHandler(object):
        Password for decryption.

    """

    def __init__(self, filepath, pages="1", password=None):
        if is_url(filepath):
            filepath = download_url(filepath)
        self.filepath = filepath
        if not filepath.lower().endswith(".pdf"):
            raise NotImplementedError("File format not supported")
        if password is None:
            self.password = ""
        else:
            self.password = password
            if sys.version_info[0] < 3:
                self.password = self.password.encode("ascii")
        self.pages = self._get_pages(self.filepath, pages)

    def _get_pages(self, filepath, pages):

@@ -60,26 +67,26 @@ class PDFHandler(object):
        """
        page_numbers = []
        if pages == "1":
            page_numbers.append({"start": 1, "end": 1})
        else:
            infile = PdfFileReader(open(filepath, "rb"), strict=False)
            if infile.isEncrypted:
                infile.decrypt(self.password)
            if pages == "all":
                page_numbers.append({"start": 1, "end": infile.getNumPages()})
            else:
                for r in pages.split(","):
                    if "-" in r:
                        a, b = r.split("-")
                        if b == "end":
                            b = infile.getNumPages()
                        page_numbers.append({"start": int(a), "end": int(b)})
                    else:
                        page_numbers.append({"start": int(r), "end": int(r)})
        P = []
        for p in page_numbers:
            P.extend(range(p["start"], p["end"] + 1))
        return sorted(set(P))

    def _save_page(self, filepath, page, temp):

@@ -95,16 +102,16 @@ class PDFHandler(object):
            Tmp directory.

        """
        with open(filepath, "rb") as fileobj:
            infile = PdfFileReader(fileobj, strict=False)
            if infile.isEncrypted:
                infile.decrypt(self.password)
            fpath = os.path.join(temp, "page-{0}.pdf".format(page))
            froot, fext = os.path.splitext(fpath)
            p = infile.getPage(page - 1)
            outfile = PdfFileWriter()
            outfile.addPage(p)
            with open(fpath, "wb") as f:
                outfile.write(f)
            layout, dim = get_page_layout(fpath)
            # fix rotated PDF

@@ -112,23 +119,25 @@ class PDFHandler(object):
            horizontal_text = get_text_objects(layout, ltype="horizontal_text")
            vertical_text = get_text_objects(layout, ltype="vertical_text")
            rotation = get_rotation(chars, horizontal_text, vertical_text)
            if rotation != "":
                fpath_new = "".join([froot.replace("page", "p"), "_rotated", fext])
                os.rename(fpath, fpath_new)
                infile = PdfFileReader(open(fpath_new, "rb"), strict=False)
                if infile.isEncrypted:
                    infile.decrypt(self.password)
                outfile = PdfFileWriter()
                p = infile.getPage(0)
                if rotation == "anticlockwise":
                    p.rotateClockwise(90)
                elif rotation == "clockwise":
                    p.rotateCounterClockwise(90)
                outfile.addPage(p)
                with open(fpath, "wb") as f:
                    outfile.write(f)

    def parse(
        self, flavor="lattice", suppress_stdout=False, layout_kwargs={}, **kwargs
    ):
        """Extracts tables by calling parser.get_tables on all single
        page PDFs.

@@ -154,11 +163,13 @@ class PDFHandler(object):
        with TemporaryDirectory() as tempdir:
            for p in self.pages:
                self._save_page(self.filepath, p, tempdir)
            pages = [
                os.path.join(tempdir, "page-{0}.pdf".format(p)) for p in self.pages
            ]
            parser = Lattice(**kwargs) if flavor == "lattice" else Stream(**kwargs)
            for p in pages:
                t = parser.extract_tables(
                    p, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs
                )
                tables.extend(t)
        return TableList(sorted(tables))
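
To make the page-spec handling in _get_pages above concrete, a small worked example (the file name and page count are hypothetical):

handler = PDFHandler("report.pdf", pages="1,3-end")
# "1"     -> {"start": 1, "end": 1}
# "3-end" -> {"start": 3, "end": infile.getNumPages()}
# for a three-page file this expands and de-duplicates to:
print(handler.pages)  # [1, 3]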

View File

@@ -39,17 +39,23 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
    if process_background:
        threshold = cv2.adaptiveThreshold(
            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, blocksize, c
        )
    else:
        threshold = cv2.adaptiveThreshold(
            np.invert(gray),
            255,
            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY,
            blocksize,
            c,
        )
    return img, threshold


def find_lines(
    threshold, regions=None, direction="horizontal", line_scale=15, iterations=0
):
    """Finds horizontal and vertical lines by applying morphological
    transformations on an image.

@@ -87,15 +93,14 @@ def find_lines(threshold, regions=None, direction='horizontal',
    """
    lines = []

    if direction == "vertical":
        size = threshold.shape[0] // line_scale
        el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size))
    elif direction == "horizontal":
        size = threshold.shape[1] // line_scale
        el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1))
    elif direction is None:
        raise ValueError("Specify direction as either 'vertical' or" " 'horizontal'")

    if regions is not None:
        region_mask = np.zeros(threshold.shape)

@@ -110,19 +115,21 @@ def find_lines(threshold, regions=None, direction='horizontal',
    try:
        _, contours, _ = cv2.findContours(
            threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
        )
    except ValueError:
        # for opencv backward compatibility
        contours, _ = cv2.findContours(
            threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
        )

    for c in contours:
        x, y, w, h = cv2.boundingRect(c)
        x1, x2 = x, x + w
        y1, y2 = y, y + h
        if direction == "vertical":
            lines.append(((x1 + x2) // 2, y2, (x1 + x2) // 2, y1))
        elif direction == "horizontal":
            lines.append((x1, (y1 + y2) // 2, x2, (y1 + y2) // 2))

    return dmask, lines

@@ -150,11 +157,13 @@ def find_contours(vertical, horizontal):
    try:
        __, contours, __ = cv2.findContours(
            mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
        )
    except ValueError:
        # for opencv backward compatibility
        contours, __ = cv2.findContours(
            mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
        )
    # sort in reverse based on contour area and use first 10 contours
    contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10]

@@ -196,11 +205,13 @@ def find_joints(contours, vertical, horizontal):
        roi = joints[y : y + h, x : x + w]
        try:
            __, jc, __ = cv2.findContours(
                roi.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE
            )
        except ValueError:
            # for opencv backward compatibility
            jc, __ = cv2.findContours(
                roi.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE
            )
        if len(jc) <= 4:  # remove contours with less than 4 joints
            continue
        joint_coords = []
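
find_lines isolates ruling lines with a long, thin structuring element; a minimal self-contained sketch of that idea (not the library's exact code, which also honors regions and extra dilation iterations):

import cv2

def horizontal_line_mask(threshold, line_scale=15):
    # kernel spanning 1/line_scale of the image width: erosion keeps only pixel
    # runs at least that long, dilation then restores their original extent
    size = threshold.shape[1] // line_scale
    el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1))
    return cv2.dilate(cv2.erode(threshold, el), el)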

View File

@@ -6,8 +6,15 @@ from .handlers import PDFHandler
from .utils import validate_input, remove_extra


def read_pdf(
    filepath,
    pages="1",
    password=None,
    flavor="lattice",
    suppress_stdout=False,
    layout_kwargs={},
    **kwargs
):
    """Read PDF and return extracted tables.

    Note: kwargs annotated with ^ can only be used with flavor='stream'

@@ -91,9 +98,10 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice',
    tables : camelot.core.TableList

    """
    if flavor not in ["lattice", "stream"]:
        raise NotImplementedError(
            "Unknown flavor specified." " Use either 'lattice' or 'stream'"
        )

    with warnings.catch_warnings():
        if suppress_stdout:

@@ -102,6 +110,10 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice',
        validate_input(kwargs, flavor=flavor)
        p = PDFHandler(filepath, pages=pages, password=password)
        kwargs = remove_extra(kwargs, flavor=flavor)
        tables = p.parse(
            flavor=flavor,
            suppress_stdout=suppress_stdout,
            layout_kwargs=layout_kwargs,
            **kwargs
        )
        return tables

View File

@@ -8,13 +8,13 @@ from ..utils import get_page_layout, get_text_objects
class BaseParser(object):
    """Defines a base parser.
    """

    def _generate_layout(self, filename, layout_kwargs):
        self.filename = filename
        self.layout_kwargs = layout_kwargs
        self.layout, self.dimensions = get_page_layout(filename, **layout_kwargs)
        self.images = get_text_objects(self.layout, ltype="image")
        self.horizontal_text = get_text_objects(self.layout, ltype="horizontal_text")
        self.vertical_text = get_text_objects(self.layout, ltype="vertical_text")
        self.pdf_width, self.pdf_height = self.dimensions
        self.rootname, __ = os.path.splitext(self.filename)

View File

@ -14,14 +14,25 @@ import pandas as pd
from .base import BaseParser from .base import BaseParser
from ..core import Table from ..core import Table
from ..utils import (scale_image, scale_pdf, segments_in_bbox, text_in_bbox, from ..utils import (
merge_close_lines, get_table_index, compute_accuracy, scale_image,
compute_whitespace) scale_pdf,
from ..image_processing import (adaptive_threshold, find_lines, segments_in_bbox,
find_contours, find_joints) text_in_bbox,
merge_close_lines,
get_table_index,
compute_accuracy,
compute_whitespace,
)
from ..image_processing import (
adaptive_threshold,
find_lines,
find_contours,
find_joints,
)
logger = logging.getLogger('camelot') logger = logging.getLogger("camelot")
class Lattice(BaseParser): class Lattice(BaseParser):
@ -83,11 +94,26 @@ class Lattice(BaseParser):
Resolution used for PDF to PNG conversion. Resolution used for PDF to PNG conversion.
""" """
def __init__(self, table_regions=None, table_areas=None, process_background=False,
line_scale=15, copy_text=None, shift_text=['l', 't'], def __init__(
split_text=False, flag_size=False, strip_text='', line_tol=2, self,
joint_tol=2, threshold_blocksize=15, threshold_constant=-2, table_regions=None,
iterations=0, resolution=300, **kwargs): table_areas=None,
process_background=False,
line_scale=15,
copy_text=None,
shift_text=["l", "t"],
split_text=False,
flag_size=False,
strip_text="",
line_tol=2,
joint_tol=2,
threshold_blocksize=15,
threshold_constant=-2,
iterations=0,
resolution=300,
**kwargs
):
self.table_regions = table_regions self.table_regions = table_regions
self.table_areas = table_areas self.table_areas = table_areas
self.process_background = process_background self.process_background = process_background
@ -130,19 +156,19 @@ class Lattice(BaseParser):
indices = [] indices = []
for r_idx, c_idx, text in idx: for r_idx, c_idx, text in idx:
for d in shift_text: for d in shift_text:
if d == 'l': if d == "l":
if t.cells[r_idx][c_idx].hspan: if t.cells[r_idx][c_idx].hspan:
while not t.cells[r_idx][c_idx].left: while not t.cells[r_idx][c_idx].left:
c_idx -= 1 c_idx -= 1
if d == 'r': if d == "r":
if t.cells[r_idx][c_idx].hspan: if t.cells[r_idx][c_idx].hspan:
while not t.cells[r_idx][c_idx].right: while not t.cells[r_idx][c_idx].right:
c_idx += 1 c_idx += 1
if d == 't': if d == "t":
if t.cells[r_idx][c_idx].vspan: if t.cells[r_idx][c_idx].vspan:
while not t.cells[r_idx][c_idx].top: while not t.cells[r_idx][c_idx].top:
r_idx -= 1 r_idx -= 1
if d == 'b': if d == "b":
if t.cells[r_idx][c_idx].vspan: if t.cells[r_idx][c_idx].vspan:
while not t.cells[r_idx][c_idx].bottom: while not t.cells[r_idx][c_idx].bottom:
r_idx += 1 r_idx += 1
@ -171,13 +197,13 @@ class Lattice(BaseParser):
if f == "h": if f == "h":
for i in range(len(t.cells)): for i in range(len(t.cells)):
for j in range(len(t.cells[i])): for j in range(len(t.cells[i])):
if t.cells[i][j].text.strip() == '': if t.cells[i][j].text.strip() == "":
if t.cells[i][j].hspan and not t.cells[i][j].left: if t.cells[i][j].hspan and not t.cells[i][j].left:
t.cells[i][j].text = t.cells[i][j - 1].text t.cells[i][j].text = t.cells[i][j - 1].text
elif f == "v": elif f == "v":
for i in range(len(t.cells)): for i in range(len(t.cells)):
for j in range(len(t.cells[i])): for j in range(len(t.cells[i])):
if t.cells[i][j].text.strip() == '': if t.cells[i][j].text.strip() == "":
if t.cells[i][j].vspan and not t.cells[i][j].top: if t.cells[i][j].vspan and not t.cells[i][j].top:
t.cells[i][j].text = t.cells[i - 1][j].text t.cells[i][j].text = t.cells[i - 1][j].text
return t return t
@ -185,11 +211,12 @@ class Lattice(BaseParser):
def _generate_image(self): def _generate_image(self):
from ..ext.ghostscript import Ghostscript from ..ext.ghostscript import Ghostscript
self.imagename = ''.join([self.rootname, '.png']) self.imagename = "".join([self.rootname, ".png"])
gs_call = '-q -sDEVICE=png16m -o {} -r300 {}'.format( gs_call = "-q -sDEVICE=png16m -o {} -r300 {}".format(
self.imagename, self.filename) self.imagename, self.filename
)
gs_call = gs_call.encode().split() gs_call = gs_call.encode().split()
null = open(os.devnull, 'wb') null = open(os.devnull, "wb")
with Ghostscript(*gs_call, stdout=null) as gs: with Ghostscript(*gs_call, stdout=null) as gs:
pass pass
null.close() null.close()
@ -208,8 +235,11 @@ class Lattice(BaseParser):
return scaled_areas return scaled_areas
self.image, self.threshold = adaptive_threshold( self.image, self.threshold = adaptive_threshold(
self.imagename, process_background=self.process_background, self.imagename,
blocksize=self.threshold_blocksize, c=self.threshold_constant) process_background=self.process_background,
blocksize=self.threshold_blocksize,
c=self.threshold_constant,
)
image_width = self.image.shape[1] image_width = self.image.shape[1]
image_height = self.image.shape[0] image_height = self.image.shape[0]
@@ -226,21 +256,35 @@ class Lattice(BaseParser):
regions = scale_areas(self.table_regions) regions = scale_areas(self.table_regions)
vertical_mask, vertical_segments = find_lines( vertical_mask, vertical_segments = find_lines(
self.threshold, regions=regions, direction='vertical', self.threshold,
line_scale=self.line_scale, iterations=self.iterations) regions=regions,
direction="vertical",
line_scale=self.line_scale,
iterations=self.iterations,
)
horizontal_mask, horizontal_segments = find_lines( horizontal_mask, horizontal_segments = find_lines(
self.threshold, regions=regions, direction='horizontal', self.threshold,
line_scale=self.line_scale, iterations=self.iterations) regions=regions,
direction="horizontal",
line_scale=self.line_scale,
iterations=self.iterations,
)
contours = find_contours(vertical_mask, horizontal_mask) contours = find_contours(vertical_mask, horizontal_mask)
table_bbox = find_joints(contours, vertical_mask, horizontal_mask) table_bbox = find_joints(contours, vertical_mask, horizontal_mask)
else: else:
vertical_mask, vertical_segments = find_lines( vertical_mask, vertical_segments = find_lines(
self.threshold, direction='vertical', line_scale=self.line_scale, self.threshold,
iterations=self.iterations) direction="vertical",
line_scale=self.line_scale,
iterations=self.iterations,
)
horizontal_mask, horizontal_segments = find_lines( horizontal_mask, horizontal_segments = find_lines(
self.threshold, direction='horizontal', line_scale=self.line_scale, self.threshold,
iterations=self.iterations) direction="horizontal",
line_scale=self.line_scale,
iterations=self.iterations,
)
areas = scale_areas(self.table_areas) areas = scale_areas(self.table_areas)
table_bbox = find_joints(areas, vertical_mask, horizontal_mask) table_bbox = find_joints(areas, vertical_mask, horizontal_mask)
@@ -248,18 +292,20 @@ class Lattice(BaseParser):
self.table_bbox_unscaled = copy.deepcopy(table_bbox) self.table_bbox_unscaled = copy.deepcopy(table_bbox)
self.table_bbox, self.vertical_segments, self.horizontal_segments = scale_image( self.table_bbox, self.vertical_segments, self.horizontal_segments = scale_image(
table_bbox, vertical_segments, horizontal_segments, pdf_scalers) table_bbox, vertical_segments, horizontal_segments, pdf_scalers
)
def _generate_columns_and_rows(self, table_idx, tk): def _generate_columns_and_rows(self, table_idx, tk):
# select elements which lie within table_bbox # select elements which lie within table_bbox
t_bbox = {} t_bbox = {}
v_s, h_s = segments_in_bbox( v_s, h_s = segments_in_bbox(
tk, self.vertical_segments, self.horizontal_segments) tk, self.vertical_segments, self.horizontal_segments
t_bbox['horizontal'] = text_in_bbox(tk, self.horizontal_text) )
t_bbox['vertical'] = text_in_bbox(tk, self.vertical_text) t_bbox["horizontal"] = text_in_bbox(tk, self.horizontal_text)
t_bbox["vertical"] = text_in_bbox(tk, self.vertical_text)
t_bbox['horizontal'].sort(key=lambda x: (-x.y0, x.x0)) t_bbox["horizontal"].sort(key=lambda x: (-x.y0, x.x0))
t_bbox['vertical'].sort(key=lambda x: (x.x0, -x.y0)) t_bbox["vertical"].sort(key=lambda x: (x.x0, -x.y0))
self.t_bbox = t_bbox self.t_bbox = t_bbox
@@ -268,23 +314,19 @@ class Lattice(BaseParser):
cols.extend([tk[0], tk[2]]) cols.extend([tk[0], tk[2]])
rows.extend([tk[1], tk[3]]) rows.extend([tk[1], tk[3]])
# sort horizontal and vertical segments # sort horizontal and vertical segments
cols = merge_close_lines( cols = merge_close_lines(sorted(cols), line_tol=self.line_tol)
sorted(cols), line_tol=self.line_tol) rows = merge_close_lines(sorted(rows, reverse=True), line_tol=self.line_tol)
rows = merge_close_lines(
sorted(rows, reverse=True), line_tol=self.line_tol)
# make grid using x and y coord of shortlisted rows and cols # make grid using x and y coord of shortlisted rows and cols
cols = [(cols[i], cols[i + 1]) cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
for i in range(0, len(cols) - 1)] rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)]
rows = [(rows[i], rows[i + 1])
for i in range(0, len(rows) - 1)]
return cols, rows, v_s, h_s return cols, rows, v_s, h_s
def _generate_table(self, table_idx, cols, rows, **kwargs): def _generate_table(self, table_idx, cols, rows, **kwargs):
v_s = kwargs.get('v_s') v_s = kwargs.get("v_s")
h_s = kwargs.get('h_s') h_s = kwargs.get("h_s")
if v_s is None or h_s is None: if v_s is None or h_s is None:
raise ValueError('No segments found on {}'.format(self.rootname)) raise ValueError("No segments found on {}".format(self.rootname))
table = Table(cols, rows) table = Table(cols, rows)
# set table edges to True using ver+hor lines # set table edges to True using ver+hor lines
@@ -297,14 +339,21 @@ class Lattice(BaseParser):
pos_errors = [] pos_errors = []
# TODO: have a single list in place of two directional ones? # TODO: have a single list in place of two directional ones?
# sorted on x-coordinate based on reading order i.e. LTR or RTL # sorted on x-coordinate based on reading order i.e. LTR or RTL
for direction in ['vertical', 'horizontal']: for direction in ["vertical", "horizontal"]:
for t in self.t_bbox[direction]: for t in self.t_bbox[direction]:
indices, error = get_table_index( indices, error = get_table_index(
table, t, direction, split_text=self.split_text, table,
flag_size=self.flag_size, strip_text=self.strip_text) t,
direction,
split_text=self.split_text,
flag_size=self.flag_size,
strip_text=self.strip_text,
)
if indices[:2] != (-1, -1): if indices[:2] != (-1, -1):
pos_errors.append(error) pos_errors.append(error)
indices = Lattice._reduce_index(table, indices, shift_text=self.shift_text) indices = Lattice._reduce_index(
table, indices, shift_text=self.shift_text
)
for r_idx, c_idx, text in indices: for r_idx, c_idx, text in indices:
table.cells[r_idx][c_idx].text = text table.cells[r_idx][c_idx].text = text
accuracy = compute_accuracy([[100, pos_errors]]) accuracy = compute_accuracy([[100, pos_errors]])
@@ -317,11 +366,11 @@ class Lattice(BaseParser):
table.shape = table.df.shape table.shape = table.df.shape
whitespace = compute_whitespace(data) whitespace = compute_whitespace(data)
table.flavor = 'lattice' table.flavor = "lattice"
table.accuracy = accuracy table.accuracy = accuracy
table.whitespace = whitespace table.whitespace = whitespace
table.order = table_idx + 1 table.order = table_idx + 1
table.page = int(os.path.basename(self.rootname).replace('page-', '')) table.page = int(os.path.basename(self.rootname).replace("page-", ""))
# for plotting # for plotting
_text = [] _text = []
@@ -337,15 +386,18 @@ class Lattice(BaseParser):
def extract_tables(self, filename, suppress_stdout=False, layout_kwargs={}): def extract_tables(self, filename, suppress_stdout=False, layout_kwargs={}):
self._generate_layout(filename, layout_kwargs) self._generate_layout(filename, layout_kwargs)
if not suppress_stdout: if not suppress_stdout:
logger.info('Processing {}'.format(os.path.basename(self.rootname))) logger.info("Processing {}".format(os.path.basename(self.rootname)))
if not self.horizontal_text: if not self.horizontal_text:
if self.images: if self.images:
warnings.warn('{} is image-based, camelot only works on' warnings.warn(
' text-based pages.'.format(os.path.basename(self.rootname))) "{} is image-based, camelot only works on"
" text-based pages.".format(os.path.basename(self.rootname))
)
else: else:
warnings.warn('No tables found on {}'.format( warnings.warn(
os.path.basename(self.rootname))) "No tables found on {}".format(os.path.basename(self.rootname))
)
return [] return []
self._generate_image() self._generate_image()
@@ -353,8 +405,9 @@ class Lattice(BaseParser):
_tables = [] _tables = []
# sort tables based on y-coord # sort tables based on y-coord
for table_idx, tk in enumerate(sorted( for table_idx, tk in enumerate(
self.table_bbox.keys(), key=lambda x: x[1], reverse=True)): sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True)
):
cols, rows, v_s, h_s = self._generate_columns_and_rows(table_idx, tk) cols, rows, v_s, h_s = self._generate_columns_and_rows(table_idx, tk)
table = self._generate_table(table_idx, cols, rows, v_s=v_s, h_s=h_s) table = self._generate_table(table_idx, cols, rows, v_s=v_s, h_s=h_s)
table._bbox = tk table._bbox = tk
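Note on the Lattice hunks above: only quoting and line wrapping change; the keyword arguments themselves (line_scale, copy_text, shift_text, line_tol, joint_tol, the threshold options, iterations) are normally passed through camelot.read_pdf rather than by constructing the parser directly. A minimal usage sketch, assuming a text-based PDF at "foo.pdf" (the path and the line_scale value are illustrative, not part of this diff):

    import camelot

    # lattice is the default flavor; line_scale controls how short a ruling
    # line may be and still be detected as a table edge
    tables = camelot.read_pdf("foo.pdf", flavor="lattice", line_scale=40)
    print(tables[0].df)  # each parsed table exposes a pandas DataFrame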

View File

@@ -10,11 +10,10 @@ import pandas as pd
from .base import BaseParser from .base import BaseParser
from ..core import TextEdges, Table from ..core import TextEdges, Table
from ..utils import (text_in_bbox, get_table_index, compute_accuracy, from ..utils import text_in_bbox, get_table_index, compute_accuracy, compute_whitespace
compute_whitespace)
logger = logging.getLogger('camelot') logger = logging.getLogger("camelot")
class Stream(BaseParser): class Stream(BaseParser):
@@ -55,9 +54,20 @@ class Stream(BaseParser):
to generate columns. to generate columns.
""" """
def __init__(self, table_regions=None, table_areas=None, columns=None, split_text=False,
flag_size=False, strip_text='', edge_tol=50, row_tol=2, def __init__(
column_tol=0, **kwargs): self,
table_regions=None,
table_areas=None,
columns=None,
split_text=False,
flag_size=False,
strip_text="",
edge_tol=50,
row_tol=2,
column_tol=0,
**kwargs
):
self.table_regions = table_regions self.table_regions = table_regions
self.table_areas = table_areas self.table_areas = table_areas
self.columns = columns self.columns = columns
@@ -150,8 +160,9 @@ class Stream(BaseParser):
else: else:
lower = merged[-1] lower = merged[-1]
if column_tol >= 0: if column_tol >= 0:
if (higher[0] <= lower[1] or if higher[0] <= lower[1] or np.isclose(
np.isclose(higher[0], lower[1], atol=column_tol)): higher[0], lower[1], atol=column_tol
):
upper_bound = max(lower[1], higher[1]) upper_bound = max(lower[1], higher[1])
lower_bound = min(lower[0], higher[0]) lower_bound = min(lower[0], higher[0])
merged[-1] = (lower_bound, upper_bound) merged[-1] = (lower_bound, upper_bound)
@@ -186,13 +197,14 @@ class Stream(BaseParser):
List of continuous row y-coordinate tuples. List of continuous row y-coordinate tuples.
""" """
row_mids = [sum([(t.y0 + t.y1) / 2 for t in r]) / len(r) row_mids = [
if len(r) > 0 else 0 for r in rows_grouped] sum([(t.y0 + t.y1) / 2 for t in r]) / len(r) if len(r) > 0 else 0
for r in rows_grouped
]
rows = [(row_mids[i] + row_mids[i - 1]) / 2 for i in range(1, len(row_mids))] rows = [(row_mids[i] + row_mids[i - 1]) / 2 for i in range(1, len(row_mids))]
rows.insert(0, text_y_max) rows.insert(0, text_y_max)
rows.append(text_y_min) rows.append(text_y_min)
rows = [(rows[i], rows[i + 1]) rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)]
for i in range(0, len(rows) - 1)]
return rows return rows
@staticmethod @staticmethod
@@ -217,8 +229,9 @@ class Stream(BaseParser):
if text: if text:
text = Stream._group_rows(text, row_tol=row_tol) text = Stream._group_rows(text, row_tol=row_tol)
elements = [len(r) for r in text] elements = [len(r) for r in text]
new_cols = [(t.x0, t.x1) new_cols = [
for r in text if len(r) == max(elements) for t in r] (t.x0, t.x1) for r in text if len(r) == max(elements) for t in r
]
cols.extend(Stream._merge_columns(sorted(new_cols))) cols.extend(Stream._merge_columns(sorted(new_cols)))
return cols return cols
@@ -243,15 +256,13 @@ class Stream(BaseParser):
cols = [(cols[i][0] + cols[i - 1][1]) / 2 for i in range(1, len(cols))] cols = [(cols[i][0] + cols[i - 1][1]) / 2 for i in range(1, len(cols))]
cols.insert(0, text_x_min) cols.insert(0, text_x_min)
cols.append(text_x_max) cols.append(text_x_max)
cols = [(cols[i], cols[i + 1]) cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
for i in range(0, len(cols) - 1)]
return cols return cols
def _validate_columns(self): def _validate_columns(self):
if self.table_areas is not None and self.columns is not None: if self.table_areas is not None and self.columns is not None:
if len(self.table_areas) != len(self.columns): if len(self.table_areas) != len(self.columns):
raise ValueError("Length of table_areas and columns" raise ValueError("Length of table_areas and columns" " should be equal")
" should be equal")
def _nurminen_table_detection(self, textlines): def _nurminen_table_detection(self, textlines):
"""A general implementation of the table detection algorithm """A general implementation of the table detection algorithm
@@ -309,16 +320,16 @@ class Stream(BaseParser):
def _generate_columns_and_rows(self, table_idx, tk): def _generate_columns_and_rows(self, table_idx, tk):
# select elements which lie within table_bbox # select elements which lie within table_bbox
t_bbox = {} t_bbox = {}
t_bbox['horizontal'] = text_in_bbox(tk, self.horizontal_text) t_bbox["horizontal"] = text_in_bbox(tk, self.horizontal_text)
t_bbox['vertical'] = text_in_bbox(tk, self.vertical_text) t_bbox["vertical"] = text_in_bbox(tk, self.vertical_text)
t_bbox['horizontal'].sort(key=lambda x: (-x.y0, x.x0)) t_bbox["horizontal"].sort(key=lambda x: (-x.y0, x.x0))
t_bbox['vertical'].sort(key=lambda x: (x.x0, -x.y0)) t_bbox["vertical"].sort(key=lambda x: (x.x0, -x.y0))
self.t_bbox = t_bbox self.t_bbox = t_bbox
text_x_min, text_y_min, text_x_max, text_y_max = self._text_bbox(self.t_bbox) text_x_min, text_y_min, text_x_max, text_y_max = self._text_bbox(self.t_bbox)
rows_grouped = self._group_rows(self.t_bbox['horizontal'], row_tol=self.row_tol) rows_grouped = self._group_rows(self.t_bbox["horizontal"], row_tol=self.row_tol)
rows = self._join_rows(rows_grouped, text_y_max, text_y_min) rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
elements = [len(r) for r in rows_grouped] elements = [len(r) for r in rows_grouped]
@@ -327,7 +338,7 @@ class Stream(BaseParser):
# take (0, pdf_width) by default # take (0, pdf_width) by default
# similar to else condition # similar to else condition
# len can't be 1 # len can't be 1
cols = self.columns[table_idx].split(',') cols = self.columns[table_idx].split(",")
cols = [float(c) for c in cols] cols = [float(c) for c in cols]
cols.insert(0, text_x_min) cols.insert(0, text_x_min)
cols.append(text_x_max) cols.append(text_x_max)
@@ -346,20 +357,29 @@ class Stream(BaseParser):
if len(elements): if len(elements):
ncols = max(set(elements), key=elements.count) ncols = max(set(elements), key=elements.count)
else: else:
warnings.warn("No tables found in table area {}".format( warnings.warn(
table_idx + 1)) "No tables found in table area {}".format(table_idx + 1)
)
cols = [(t.x0, t.x1) for r in rows_grouped if len(r) == ncols for t in r] cols = [(t.x0, t.x1) for r in rows_grouped if len(r) == ncols for t in r]
cols = self._merge_columns(sorted(cols), column_tol=self.column_tol) cols = self._merge_columns(sorted(cols), column_tol=self.column_tol)
inner_text = [] inner_text = []
for i in range(1, len(cols)): for i in range(1, len(cols)):
left = cols[i - 1][1] left = cols[i - 1][1]
right = cols[i][0] right = cols[i][0]
inner_text.extend([t for direction in self.t_bbox inner_text.extend(
for t in self.t_bbox[direction] [
if t.x0 > left and t.x1 < right]) t
outer_text = [t for direction in self.t_bbox for direction in self.t_bbox
for t in self.t_bbox[direction] for t in self.t_bbox[direction]
if t.x0 > cols[-1][1] or t.x1 < cols[0][0]] if t.x0 > left and t.x1 < right
]
)
outer_text = [
t
for direction in self.t_bbox
for t in self.t_bbox[direction]
if t.x0 > cols[-1][1] or t.x1 < cols[0][0]
]
inner_text.extend(outer_text) inner_text.extend(outer_text)
cols = self._add_columns(cols, inner_text, self.row_tol) cols = self._add_columns(cols, inner_text, self.row_tol)
cols = self._join_columns(cols, text_x_min, text_x_max) cols = self._join_columns(cols, text_x_min, text_x_max)
@@ -373,11 +393,16 @@ class Stream(BaseParser):
pos_errors = [] pos_errors = []
# TODO: have a single list in place of two directional ones? # TODO: have a single list in place of two directional ones?
# sorted on x-coordinate based on reading order i.e. LTR or RTL # sorted on x-coordinate based on reading order i.e. LTR or RTL
for direction in ['vertical', 'horizontal']: for direction in ["vertical", "horizontal"]:
for t in self.t_bbox[direction]: for t in self.t_bbox[direction]:
indices, error = get_table_index( indices, error = get_table_index(
table, t, direction, split_text=self.split_text, table,
flag_size=self.flag_size, strip_text=self.strip_text) t,
direction,
split_text=self.split_text,
flag_size=self.flag_size,
strip_text=self.strip_text,
)
if indices[:2] != (-1, -1): if indices[:2] != (-1, -1):
pos_errors.append(error) pos_errors.append(error)
for r_idx, c_idx, text in indices: for r_idx, c_idx, text in indices:
@@ -389,11 +414,11 @@ class Stream(BaseParser):
table.shape = table.df.shape table.shape = table.df.shape
whitespace = compute_whitespace(data) whitespace = compute_whitespace(data)
table.flavor = 'stream' table.flavor = "stream"
table.accuracy = accuracy table.accuracy = accuracy
table.whitespace = whitespace table.whitespace = whitespace
table.order = table_idx + 1 table.order = table_idx + 1
table.page = int(os.path.basename(self.rootname).replace('page-', '')) table.page = int(os.path.basename(self.rootname).replace("page-", ""))
# for plotting # for plotting
_text = [] _text = []
@@ -409,23 +434,27 @@ class Stream(BaseParser):
def extract_tables(self, filename, suppress_stdout=False, layout_kwargs={}): def extract_tables(self, filename, suppress_stdout=False, layout_kwargs={}):
self._generate_layout(filename, layout_kwargs) self._generate_layout(filename, layout_kwargs)
if not suppress_stdout: if not suppress_stdout:
logger.info('Processing {}'.format(os.path.basename(self.rootname))) logger.info("Processing {}".format(os.path.basename(self.rootname)))
if not self.horizontal_text: if not self.horizontal_text:
if self.images: if self.images:
warnings.warn('{} is image-based, camelot only works on' warnings.warn(
' text-based pages.'.format(os.path.basename(self.rootname))) "{} is image-based, camelot only works on"
" text-based pages.".format(os.path.basename(self.rootname))
)
else: else:
warnings.warn('No tables found on {}'.format( warnings.warn(
os.path.basename(self.rootname))) "No tables found on {}".format(os.path.basename(self.rootname))
)
return [] return []
self._generate_table_bbox() self._generate_table_bbox()
_tables = [] _tables = []
# sort tables based on y-coord # sort tables based on y-coord
for table_idx, tk in enumerate(sorted( for table_idx, tk in enumerate(
self.table_bbox.keys(), key=lambda x: x[1], reverse=True)): sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True)
):
cols, rows = self._generate_columns_and_rows(table_idx, tk) cols, rows = self._generate_columns_and_rows(table_idx, tk)
table = self._generate_table(table_idx, cols, rows) table = self._generate_table(table_idx, cols, rows)
table._bbox = tk table._bbox = tk
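As with Lattice, the Stream keyword arguments re-wrapped above (columns, edge_tol, row_tol, column_tol, split_text, flag_size, strip_text) reach the parser through camelot.read_pdf. A short sketch under the same assumptions (illustrative path and tolerance values):

    import camelot

    # stream infers structure from text alignment; row_tol groups nearby
    # textlines into one row, edge_tol relaxes textedge detection
    tables = camelot.read_pdf("foo.pdf", flavor="stream", row_tol=10, edge_tol=500)
    print(tables[0].accuracy, tables[0].whitespace)  # metrics set in _generate_table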

View File

@@ -10,7 +10,7 @@ else:
class PlotMethods(object): class PlotMethods(object):
def __call__(self, table, kind='text', filename=None): def __call__(self, table, kind="text", filename=None):
"""Plot elements found on PDF page based on kind """Plot elements found on PDF page based on kind
specified, useful for debugging and playing with different specified, useful for debugging and playing with different
parameters to get the best output. parameters to get the best output.
@@ -31,14 +31,16 @@ class PlotMethods(object):
""" """
if not _HAS_MPL: if not _HAS_MPL:
raise ImportError('matplotlib is required for plotting.') raise ImportError("matplotlib is required for plotting.")
if table.flavor == 'lattice' and kind in ['textedge']: if table.flavor == "lattice" and kind in ["textedge"]:
raise NotImplementedError("Lattice flavor does not support kind='{}'".format( raise NotImplementedError(
kind)) "Lattice flavor does not support kind='{}'".format(kind)
elif table.flavor == 'stream' and kind in ['joint', 'line']: )
raise NotImplementedError("Stream flavor does not support kind='{}'".format( elif table.flavor == "stream" and kind in ["joint", "line"]:
kind)) raise NotImplementedError(
"Stream flavor does not support kind='{}'".format(kind)
)
plot_method = getattr(self, kind) plot_method = getattr(self, kind)
return plot_method(table) return plot_method(table)
@@ -57,18 +59,12 @@ class PlotMethods(object):
""" """
fig = plt.figure() fig = plt.figure()
ax = fig.add_subplot(111, aspect='equal') ax = fig.add_subplot(111, aspect="equal")
xs, ys = [], [] xs, ys = [], []
for t in table._text: for t in table._text:
xs.extend([t[0], t[2]]) xs.extend([t[0], t[2]])
ys.extend([t[1], t[3]]) ys.extend([t[1], t[3]])
ax.add_patch( ax.add_patch(patches.Rectangle((t[0], t[1]), t[2] - t[0], t[3] - t[1]))
patches.Rectangle(
(t[0], t[1]),
t[2] - t[0],
t[3] - t[1]
)
)
ax.set_xlim(min(xs) - 10, max(xs) + 10) ax.set_xlim(min(xs) - 10, max(xs) + 10)
ax.set_ylim(min(ys) - 10, max(ys) + 10) ax.set_ylim(min(ys) - 10, max(ys) + 10)
return fig return fig
@@ -87,21 +83,17 @@ class PlotMethods(object):
""" """
fig = plt.figure() fig = plt.figure()
ax = fig.add_subplot(111, aspect='equal') ax = fig.add_subplot(111, aspect="equal")
for row in table.cells: for row in table.cells:
for cell in row: for cell in row:
if cell.left: if cell.left:
ax.plot([cell.lb[0], cell.lt[0]], ax.plot([cell.lb[0], cell.lt[0]], [cell.lb[1], cell.lt[1]])
[cell.lb[1], cell.lt[1]])
if cell.right: if cell.right:
ax.plot([cell.rb[0], cell.rt[0]], ax.plot([cell.rb[0], cell.rt[0]], [cell.rb[1], cell.rt[1]])
[cell.rb[1], cell.rt[1]])
if cell.top: if cell.top:
ax.plot([cell.lt[0], cell.rt[0]], ax.plot([cell.lt[0], cell.rt[0]], [cell.lt[1], cell.rt[1]])
[cell.lt[1], cell.rt[1]])
if cell.bottom: if cell.bottom:
ax.plot([cell.lb[0], cell.rb[0]], ax.plot([cell.lb[0], cell.rb[0]], [cell.lb[1], cell.rb[1]])
[cell.lb[1], cell.rb[1]])
return fig return fig
def contour(self, table): def contour(self, table):
@@ -124,7 +116,7 @@ class PlotMethods(object):
img, table_bbox = (None, {table._bbox: None}) img, table_bbox = (None, {table._bbox: None})
_FOR_LATTICE = False _FOR_LATTICE = False
fig = plt.figure() fig = plt.figure()
ax = fig.add_subplot(111, aspect='equal') ax = fig.add_subplot(111, aspect="equal")
xs, ys = [], [] xs, ys = [], []
if not _FOR_LATTICE: if not _FOR_LATTICE:
@@ -133,21 +125,14 @@ class PlotMethods(object):
ys.extend([t[1], t[3]]) ys.extend([t[1], t[3]])
ax.add_patch( ax.add_patch(
patches.Rectangle( patches.Rectangle(
(t[0], t[1]), (t[0], t[1]), t[2] - t[0], t[3] - t[1], color="blue"
t[2] - t[0],
t[3] - t[1],
color='blue'
) )
) )
for t in table_bbox.keys(): for t in table_bbox.keys():
ax.add_patch( ax.add_patch(
patches.Rectangle( patches.Rectangle(
(t[0], t[1]), (t[0], t[1]), t[2] - t[0], t[3] - t[1], fill=False, color="red"
t[2] - t[0],
t[3] - t[1],
fill=False,
color='red'
) )
) )
if not _FOR_LATTICE: if not _FOR_LATTICE:
@@ -173,25 +158,19 @@ class PlotMethods(object):
""" """
fig = plt.figure() fig = plt.figure()
ax = fig.add_subplot(111, aspect='equal') ax = fig.add_subplot(111, aspect="equal")
xs, ys = [], [] xs, ys = [], []
for t in table._text: for t in table._text:
xs.extend([t[0], t[2]]) xs.extend([t[0], t[2]])
ys.extend([t[1], t[3]]) ys.extend([t[1], t[3]])
ax.add_patch( ax.add_patch(
patches.Rectangle( patches.Rectangle((t[0], t[1]), t[2] - t[0], t[3] - t[1], color="blue")
(t[0], t[1]),
t[2] - t[0],
t[3] - t[1],
color='blue'
)
) )
ax.set_xlim(min(xs) - 10, max(xs) + 10) ax.set_xlim(min(xs) - 10, max(xs) + 10)
ax.set_ylim(min(ys) - 10, max(ys) + 10) ax.set_ylim(min(ys) - 10, max(ys) + 10)
for te in table._textedges: for te in table._textedges:
ax.plot([te.x, te.x], ax.plot([te.x, te.x], [te.y0, te.y1])
[te.y0, te.y1])
return fig return fig
@@ -210,14 +189,14 @@ class PlotMethods(object):
""" """
img, table_bbox = table._image img, table_bbox = table._image
fig = plt.figure() fig = plt.figure()
ax = fig.add_subplot(111, aspect='equal') ax = fig.add_subplot(111, aspect="equal")
x_coord = [] x_coord = []
y_coord = [] y_coord = []
for k in table_bbox.keys(): for k in table_bbox.keys():
for coord in table_bbox[k]: for coord in table_bbox[k]:
x_coord.append(coord[0]) x_coord.append(coord[0])
y_coord.append(coord[1]) y_coord.append(coord[1])
ax.plot(x_coord, y_coord, 'ro') ax.plot(x_coord, y_coord, "ro")
ax.imshow(img) ax.imshow(img)
return fig return fig
@@ -235,7 +214,7 @@ class PlotMethods(object):
""" """
fig = plt.figure() fig = plt.figure()
ax = fig.add_subplot(111, aspect='equal') ax = fig.add_subplot(111, aspect="equal")
vertical, horizontal = table._segments vertical, horizontal = table._segments
for v in vertical: for v in vertical:
ax.plot([v[0], v[2]], [v[1], v[3]]) ax.plot([v[0], v[2]], [v[1], v[3]])
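The PlotMethods hunks above only re-wrap existing calls; the plotting entry point itself is unchanged. A sketch of how it is typically called (requires matplotlib; the file path is again illustrative, and kind must be one the table's flavor supports, as checked in __call__ above):

    import camelot
    import matplotlib.pyplot as plt

    tables = camelot.read_pdf("foo.pdf", flavor="lattice")
    fig = camelot.plot(tables[0], kind="contour")  # returns a matplotlib Figure
    plt.show()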

View File

@@ -19,8 +19,14 @@ from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import (LAParams, LTAnno, LTChar, LTTextLineHorizontal, from pdfminer.layout import (
LTTextLineVertical, LTImage) LAParams,
LTAnno,
LTChar,
LTTextLineHorizontal,
LTTextLineVertical,
LTImage,
)
PY3 = sys.version_info[0] >= 3 PY3 = sys.version_info[0] >= 3
@@ -35,7 +41,7 @@ else:
_VALID_URLS = set(uses_relative + uses_netloc + uses_params) _VALID_URLS = set(uses_relative + uses_netloc + uses_params)
_VALID_URLS.discard('') _VALID_URLS.discard("")
# https://github.com/pandas-dev/pandas/blob/master/pandas/io/common.py # https://github.com/pandas-dev/pandas/blob/master/pandas/io/common.py
@@ -59,9 +65,11 @@ def is_url(url):
def random_string(length): def random_string(length):
ret = '' ret = ""
while length: while length:
ret += random.choice(string.digits + string.ascii_lowercase + string.ascii_uppercase) ret += random.choice(
string.digits + string.ascii_lowercase + string.ascii_uppercase
)
length -= 1 length -= 1
return ret return ret
@@ -79,14 +87,14 @@ def download_url(url):
Temporary filepath. Temporary filepath.
""" """
filename = '{}.pdf'.format(random_string(6)) filename = "{}.pdf".format(random_string(6))
with tempfile.NamedTemporaryFile('wb', delete=False) as f: with tempfile.NamedTemporaryFile("wb", delete=False) as f:
obj = urlopen(url) obj = urlopen(url)
if PY3: if PY3:
content_type = obj.info().get_content_type() content_type = obj.info().get_content_type()
else: else:
content_type = obj.info().getheader('Content-Type') content_type = obj.info().getheader("Content-Type")
if content_type != 'application/pdf': if content_type != "application/pdf":
raise NotImplementedError("File format not supported") raise NotImplementedError("File format not supported")
f.write(obj.read()) f.write(obj.read())
filepath = os.path.join(os.path.dirname(f.name), filename) filepath = os.path.join(os.path.dirname(f.name), filename)
@@ -94,39 +102,38 @@ def download_url(url):
return filepath return filepath
stream_kwargs = [ stream_kwargs = ["columns", "row_tol", "column_tol"]
'columns',
'row_tol',
'column_tol'
]
lattice_kwargs = [ lattice_kwargs = [
'process_background', "process_background",
'line_scale', "line_scale",
'copy_text', "copy_text",
'shift_text', "shift_text",
'line_tol', "line_tol",
'joint_tol', "joint_tol",
'threshold_blocksize', "threshold_blocksize",
'threshold_constant', "threshold_constant",
'iterations' "iterations",
] ]
def validate_input(kwargs, flavor='lattice'): def validate_input(kwargs, flavor="lattice"):
def check_intersection(parser_kwargs, input_kwargs): def check_intersection(parser_kwargs, input_kwargs):
isec = set(parser_kwargs).intersection(set(input_kwargs.keys())) isec = set(parser_kwargs).intersection(set(input_kwargs.keys()))
if isec: if isec:
raise ValueError("{} cannot be used with flavor='{}'".format( raise ValueError(
",".join(sorted(isec)), flavor)) "{} cannot be used with flavor='{}'".format(
",".join(sorted(isec)), flavor
)
)
if flavor == 'lattice': if flavor == "lattice":
check_intersection(stream_kwargs, kwargs) check_intersection(stream_kwargs, kwargs)
else: else:
check_intersection(lattice_kwargs, kwargs) check_intersection(lattice_kwargs, kwargs)
def remove_extra(kwargs, flavor='lattice'): def remove_extra(kwargs, flavor="lattice"):
if flavor == 'lattice': if flavor == "lattice":
for key in kwargs.keys(): for key in kwargs.keys():
if key in stream_kwargs: if key in stream_kwargs:
kwargs.pop(key) kwargs.pop(key)
@@ -256,15 +263,19 @@ def scale_image(tables, v_segments, h_segments, factors):
v_segments_new = [] v_segments_new = []
for v in v_segments: for v in v_segments:
x1, x2 = scale(v[0], scaling_factor_x), scale(v[2], scaling_factor_x) x1, x2 = scale(v[0], scaling_factor_x), scale(v[2], scaling_factor_x)
y1, y2 = scale(abs(translate(-img_y, v[1])), scaling_factor_y), scale( y1, y2 = (
abs(translate(-img_y, v[3])), scaling_factor_y) scale(abs(translate(-img_y, v[1])), scaling_factor_y),
scale(abs(translate(-img_y, v[3])), scaling_factor_y),
)
v_segments_new.append((x1, y1, x2, y2)) v_segments_new.append((x1, y1, x2, y2))
h_segments_new = [] h_segments_new = []
for h in h_segments: for h in h_segments:
x1, x2 = scale(h[0], scaling_factor_x), scale(h[2], scaling_factor_x) x1, x2 = scale(h[0], scaling_factor_x), scale(h[2], scaling_factor_x)
y1, y2 = scale(abs(translate(-img_y, h[1])), scaling_factor_y), scale( y1, y2 = (
abs(translate(-img_y, h[3])), scaling_factor_y) scale(abs(translate(-img_y, h[1])), scaling_factor_y),
scale(abs(translate(-img_y, h[3])), scaling_factor_y),
)
h_segments_new.append((x1, y1, x2, y2)) h_segments_new.append((x1, y1, x2, y2))
return tables_new, v_segments_new, h_segments_new return tables_new, v_segments_new, h_segments_new
@@ -291,13 +302,13 @@ def get_rotation(chars, horizontal_text, vertical_text):
rotated 90 degree clockwise. rotated 90 degree clockwise.
""" """
rotation = '' rotation = ""
hlen = len([t for t in horizontal_text if t.get_text().strip()]) hlen = len([t for t in horizontal_text if t.get_text().strip()])
vlen = len([t for t in vertical_text if t.get_text().strip()]) vlen = len([t for t in vertical_text if t.get_text().strip()])
if hlen < vlen: if hlen < vlen:
clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in chars) clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in chars)
anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in chars) anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in chars)
rotation = 'anticlockwise' if clockwise < anticlockwise else 'clockwise' rotation = "anticlockwise" if clockwise < anticlockwise else "clockwise"
return rotation return rotation
@@ -325,10 +336,16 @@ def segments_in_bbox(bbox, v_segments, h_segments):
""" """
lb = (bbox[0], bbox[1]) lb = (bbox[0], bbox[1])
rt = (bbox[2], bbox[3]) rt = (bbox[2], bbox[3])
v_s = [v for v in v_segments if v[1] > lb[1] - 2 and v_s = [
v[3] < rt[1] + 2 and lb[0] - 2 <= v[0] <= rt[0] + 2] v
h_s = [h for h in h_segments if h[0] > lb[0] - 2 and for v in v_segments
h[2] < rt[0] + 2 and lb[1] - 2 <= h[1] <= rt[1] + 2] if v[1] > lb[1] - 2 and v[3] < rt[1] + 2 and lb[0] - 2 <= v[0] <= rt[0] + 2
]
h_s = [
h
for h in h_segments
if h[0] > lb[0] - 2 and h[2] < rt[0] + 2 and lb[1] - 2 <= h[1] <= rt[1] + 2
]
return v_s, h_s return v_s, h_s
@@ -351,9 +368,12 @@ def text_in_bbox(bbox, text):
""" """
lb = (bbox[0], bbox[1]) lb = (bbox[0], bbox[1])
rt = (bbox[2], bbox[3]) rt = (bbox[2], bbox[3])
t_bbox = [t for t in text if lb[0] - 2 <= (t.x0 + t.x1) / 2.0 t_bbox = [
<= rt[0] + 2 and lb[1] - 2 <= (t.y0 + t.y1) / 2.0 t
<= rt[1] + 2] for t in text
if lb[0] - 2 <= (t.x0 + t.x1) / 2.0 <= rt[0] + 2
and lb[1] - 2 <= (t.y0 + t.y1) / 2.0 <= rt[1] + 2
]
return t_bbox return t_bbox
@@ -390,7 +410,7 @@ def merge_close_lines(ar, line_tol=2):
# (inspired from sklearn.pipeline.Pipeline) # (inspired from sklearn.pipeline.Pipeline)
def flag_font_size(textline, direction, strip_text=''): def flag_font_size(textline, direction, strip_text=""):
"""Flags super/subscripts in text by enclosing them with <s></s>. """Flags super/subscripts in text by enclosing them with <s></s>.
May give false positives. May give false positives.
@@ -409,10 +429,18 @@ def flag_font_size(textline, direction, strip_text=''):
fstring : string fstring : string
""" """
if direction == 'horizontal': if direction == "horizontal":
d = [(t.get_text(), np.round(t.height, decimals=6)) for t in textline if not isinstance(t, LTAnno)] d = [
elif direction == 'vertical': (t.get_text(), np.round(t.height, decimals=6))
d = [(t.get_text(), np.round(t.width, decimals=6)) for t in textline if not isinstance(t, LTAnno)] for t in textline
if not isinstance(t, LTAnno)
]
elif direction == "vertical":
d = [
(t.get_text(), np.round(t.width, decimals=6))
for t in textline
if not isinstance(t, LTAnno)
]
l = [np.round(size, decimals=6) for text, size in d] l = [np.round(size, decimals=6) for text, size in d]
if len(set(l)) > 1: if len(set(l)) > 1:
flist = [] flist = []
@@ -420,21 +448,21 @@ def flag_font_size(textline, direction, strip_text=''):
for key, chars in groupby(d, itemgetter(1)): for key, chars in groupby(d, itemgetter(1)):
if key == min_size: if key == min_size:
fchars = [t[0] for t in chars] fchars = [t[0] for t in chars]
if ''.join(fchars).strip(): if "".join(fchars).strip():
fchars.insert(0, '<s>') fchars.insert(0, "<s>")
fchars.append('</s>') fchars.append("</s>")
flist.append(''.join(fchars)) flist.append("".join(fchars))
else: else:
fchars = [t[0] for t in chars] fchars = [t[0] for t in chars]
if ''.join(fchars).strip(): if "".join(fchars).strip():
flist.append(''.join(fchars)) flist.append("".join(fchars))
fstring = ''.join(flist).strip(strip_text) fstring = "".join(flist).strip(strip_text)
else: else:
fstring = ''.join([t.get_text() for t in textline]).strip(strip_text) fstring = "".join([t.get_text() for t in textline]).strip(strip_text)
return fstring return fstring
def split_textline(table, textline, direction, flag_size=False, strip_text=''): def split_textline(table, textline, direction, flag_size=False, strip_text=""):
"""Splits PDFMiner LTTextLine into substrings if it spans across """Splits PDFMiner LTTextLine into substrings if it spans across
multiple rows/columns. multiple rows/columns.
@@ -464,19 +492,31 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=''):
cut_text = [] cut_text = []
bbox = textline.bbox bbox = textline.bbox
try: try:
if direction == 'horizontal' and not textline.is_empty(): if direction == "horizontal" and not textline.is_empty():
x_overlap = [i for i, x in enumerate(table.cols) if x[0] <= bbox[2] and bbox[0] <= x[1]] x_overlap = [
r_idx = [j for j, r in enumerate(table.rows) if r[1] <= (bbox[1] + bbox[3]) / 2 <= r[0]] i
for i, x in enumerate(table.cols)
if x[0] <= bbox[2] and bbox[0] <= x[1]
]
r_idx = [
j
for j, r in enumerate(table.rows)
if r[1] <= (bbox[1] + bbox[3]) / 2 <= r[0]
]
r = r_idx[0] r = r_idx[0]
x_cuts = [(c, table.cells[r][c].x2) for c in x_overlap if table.cells[r][c].right] x_cuts = [
(c, table.cells[r][c].x2) for c in x_overlap if table.cells[r][c].right
]
if not x_cuts: if not x_cuts:
x_cuts = [(x_overlap[0], table.cells[r][-1].x2)] x_cuts = [(x_overlap[0], table.cells[r][-1].x2)]
for obj in textline._objs: for obj in textline._objs:
row = table.rows[r] row = table.rows[r]
for cut in x_cuts: for cut in x_cuts:
if isinstance(obj, LTChar): if isinstance(obj, LTChar):
if (row[1] <= (obj.y0 + obj.y1) / 2 <= row[0] and if (
(obj.x0 + obj.x1) / 2 <= cut[1]): row[1] <= (obj.y0 + obj.y1) / 2 <= row[0]
and (obj.x0 + obj.x1) / 2 <= cut[1]
):
cut_text.append((r, cut[0], obj)) cut_text.append((r, cut[0], obj))
break break
else: else:
@@ -485,19 +525,31 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=''):
cut_text.append((r, cut[0] + 1, obj)) cut_text.append((r, cut[0] + 1, obj))
elif isinstance(obj, LTAnno): elif isinstance(obj, LTAnno):
cut_text.append((r, cut[0], obj)) cut_text.append((r, cut[0], obj))
elif direction == 'vertical' and not textline.is_empty(): elif direction == "vertical" and not textline.is_empty():
y_overlap = [j for j, y in enumerate(table.rows) if y[1] <= bbox[3] and bbox[1] <= y[0]] y_overlap = [
c_idx = [i for i, c in enumerate(table.cols) if c[0] <= (bbox[0] + bbox[2]) / 2 <= c[1]] j
for j, y in enumerate(table.rows)
if y[1] <= bbox[3] and bbox[1] <= y[0]
]
c_idx = [
i
for i, c in enumerate(table.cols)
if c[0] <= (bbox[0] + bbox[2]) / 2 <= c[1]
]
c = c_idx[0] c = c_idx[0]
y_cuts = [(r, table.cells[r][c].y1) for r in y_overlap if table.cells[r][c].bottom] y_cuts = [
(r, table.cells[r][c].y1) for r in y_overlap if table.cells[r][c].bottom
]
if not y_cuts: if not y_cuts:
y_cuts = [(y_overlap[0], table.cells[-1][c].y1)] y_cuts = [(y_overlap[0], table.cells[-1][c].y1)]
for obj in textline._objs: for obj in textline._objs:
col = table.cols[c] col = table.cols[c]
for cut in y_cuts: for cut in y_cuts:
if isinstance(obj, LTChar): if isinstance(obj, LTChar):
if (col[0] <= (obj.x0 + obj.x1) / 2 <= col[1] and if (
(obj.y0 + obj.y1) / 2 >= cut[1]): col[0] <= (obj.x0 + obj.x1) / 2 <= col[1]
and (obj.y0 + obj.y1) / 2 >= cut[1]
):
cut_text.append((cut[0], c, obj)) cut_text.append((cut[0], c, obj))
break break
else: else:
@@ -511,15 +563,24 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=''):
grouped_chars = [] grouped_chars = []
for key, chars in groupby(cut_text, itemgetter(0, 1)): for key, chars in groupby(cut_text, itemgetter(0, 1)):
if flag_size: if flag_size:
grouped_chars.append((key[0], key[1], grouped_chars.append(
flag_font_size([t[2] for t in chars], direction, strip_text=strip_text))) (
key[0],
key[1],
flag_font_size(
[t[2] for t in chars], direction, strip_text=strip_text
),
)
)
else: else:
gchars = [t[2].get_text() for t in chars] gchars = [t[2].get_text() for t in chars]
grouped_chars.append((key[0], key[1], ''.join(gchars).strip(strip_text))) grouped_chars.append((key[0], key[1], "".join(gchars).strip(strip_text)))
return grouped_chars return grouped_chars
def get_table_index(table, t, direction, split_text=False, flag_size=False, strip_text='',): def get_table_index(
table, t, direction, split_text=False, flag_size=False, strip_text=""
):
"""Gets indices of the table cell where given text object lies by """Gets indices of the table cell where given text object lies by
comparing their y and x-coordinates. comparing their y and x-coordinates.
@@ -558,8 +619,9 @@ def get_table_index(table, t, direction, split_text=False, flag_size=False, stri
""" """
r_idx, c_idx = [-1] * 2 r_idx, c_idx = [-1] * 2
for r in range(len(table.rows)): for r in range(len(table.rows)):
if ((t.y0 + t.y1) / 2.0 < table.rows[r][0] and if (t.y0 + t.y1) / 2.0 < table.rows[r][0] and (t.y0 + t.y1) / 2.0 > table.rows[
(t.y0 + t.y1) / 2.0 > table.rows[r][1]): r
][1]:
lt_col_overlap = [] lt_col_overlap = []
for c in table.cols: for c in table.cols:
if c[0] <= t.x1 and c[1] >= t.x0: if c[0] <= t.x1 and c[1] >= t.x0:
@@ -569,11 +631,14 @@ def get_table_index(table, t, direction, split_text=False, flag_size=False, stri
else: else:
lt_col_overlap.append(-1) lt_col_overlap.append(-1)
if len(list(filter(lambda x: x != -1, lt_col_overlap))) == 0: if len(list(filter(lambda x: x != -1, lt_col_overlap))) == 0:
text = t.get_text().strip('\n') text = t.get_text().strip("\n")
text_range = (t.x0, t.x1) text_range = (t.x0, t.x1)
col_range = (table.cols[0][0], table.cols[-1][1]) col_range = (table.cols[0][0], table.cols[-1][1])
warnings.warn("{} {} does not lie in column range {}".format( warnings.warn(
text, text_range, col_range)) "{} {} does not lie in column range {}".format(
text, text_range, col_range
)
)
r_idx = r r_idx = r
c_idx = lt_col_overlap.index(max(lt_col_overlap)) c_idx = lt_col_overlap.index(max(lt_col_overlap))
break break
@@ -594,10 +659,24 @@ def get_table_index(table, t, direction, split_text=False, flag_size=False, stri
error = ((X * (y0_offset + y1_offset)) + (Y * (x0_offset + x1_offset))) / charea error = ((X * (y0_offset + y1_offset)) + (Y * (x0_offset + x1_offset))) / charea
if split_text: if split_text:
return split_textline(table, t, direction, flag_size=flag_size, strip_text=strip_text), error return (
split_textline(
table, t, direction, flag_size=flag_size, strip_text=strip_text
),
error,
)
else: else:
if flag_size: if flag_size:
return [(r_idx, c_idx, flag_font_size(t._objs, direction, strip_text=strip_text))], error return (
[
(
r_idx,
c_idx,
flag_font_size(t._objs, direction, strip_text=strip_text),
)
],
error,
)
else: else:
return [(r_idx, c_idx, t.get_text().strip(strip_text))], error return [(r_idx, c_idx, t.get_text().strip(strip_text))], error
@@ -650,14 +729,20 @@ def compute_whitespace(d):
r_nempty_cells, c_nempty_cells = [], [] r_nempty_cells, c_nempty_cells = [], []
for i in d: for i in d:
for j in i: for j in i:
if j.strip() == '': if j.strip() == "":
whitespace += 1 whitespace += 1
whitespace = 100 * (whitespace / float(len(d) * len(d[0]))) whitespace = 100 * (whitespace / float(len(d) * len(d[0])))
return whitespace return whitespace
def get_page_layout(filename, char_margin=1.0, line_margin=0.5, word_margin=0.1, def get_page_layout(
detect_vertical=True, all_texts=True): filename,
char_margin=1.0,
line_margin=0.5,
word_margin=0.1,
detect_vertical=True,
all_texts=True,
):
"""Returns a PDFMiner LTPage object and page dimension of a single """Returns a PDFMiner LTPage object and page dimension of a single
page pdf. See https://euske.github.io/pdfminer/ to get definitions page pdf. See https://euske.github.io/pdfminer/ to get definitions
of kwargs. of kwargs.
@@ -680,16 +765,18 @@ def get_page_layout(filename, char_margin=1.0, line_margin=0.5, word_margin=0.1,
Dimension of pdf page in the form (width, height). Dimension of pdf page in the form (width, height).
""" """
with open(filename, 'rb') as f: with open(filename, "rb") as f:
parser = PDFParser(f) parser = PDFParser(f)
document = PDFDocument(parser) document = PDFDocument(parser)
if not document.is_extractable: if not document.is_extractable:
raise PDFTextExtractionNotAllowed raise PDFTextExtractionNotAllowed
laparams = LAParams(char_margin=char_margin, laparams = LAParams(
line_margin=line_margin, char_margin=char_margin,
word_margin=word_margin, line_margin=line_margin,
detect_vertical=detect_vertical, word_margin=word_margin,
all_texts=all_texts) detect_vertical=detect_vertical,
all_texts=all_texts,
)
rsrcmgr = PDFResourceManager() rsrcmgr = PDFResourceManager()
device = PDFPageAggregator(rsrcmgr, laparams=laparams) device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device) interpreter = PDFPageInterpreter(rsrcmgr, device)
@@ -721,13 +808,13 @@ def get_text_objects(layout, ltype="char", t=None):
List of PDFMiner text objects. List of PDFMiner text objects.
""" """
if ltype == 'char': if ltype == "char":
LTObject = LTChar LTObject = LTChar
elif ltype == 'image': elif ltype == "image":
LTObject = LTImage LTObject = LTImage
elif ltype == 'horizontal_text': elif ltype == "horizontal_text":
LTObject = LTTextLineHorizontal LTObject = LTTextLineHorizontal
elif ltype == 'vertical_text': elif ltype == "vertical_text":
LTObject = LTTextLineVertical LTObject = LTTextLineVertical
if t is None: if t is None:
t = [] t = []