Merge pull request #1 from camelot-dev/blacken-code

[MRG] Blacken code
Vinayak Mehta 2019-07-04 00:20:57 +05:30 committed by GitHub
commit 9137df2f6c
15 changed files with 892 additions and 551 deletions
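
The changes below are the output of running the Black code formatter over the code base: quote normalization, re-wrapped call sites and signatures, and whitespace adjustments, with no behavioural change intended. As a minimal before/after illustration drawn from the hunks further down (shown here out of diff context):

# before blackening
logger = logging.getLogger('camelot')
@click.option('-pw', '--password', help='Password for decryption.')

# after blackening
logger = logging.getLogger("camelot")
@click.option("-pw", "--password", help="Password for decryption.")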

View File

@ -9,8 +9,8 @@ from .io import read_pdf
from .plotting import PlotMethods
def _write_usage(self, prog, args='', prefix='Usage: '):
return self._write_usage('camelot', args, prefix=prefix)
def _write_usage(self, prog, args="", prefix="Usage: "):
return self._write_usage("camelot", args, prefix=prefix)
# monkey patch click.HelpFormatter
@ -18,10 +18,10 @@ HelpFormatter._write_usage = HelpFormatter.write_usage
HelpFormatter.write_usage = _write_usage
# set up logging
logger = logging.getLogger('camelot')
logger = logging.getLogger("camelot")
format_string = '%(asctime)s - %(levelname)s - %(message)s'
formatter = logging.Formatter(format_string, datefmt='%Y-%m-%dT%H:%M:%S')
format_string = "%(asctime)s - %(levelname)s - %(message)s"
formatter = logging.Formatter(format_string, datefmt="%Y-%m-%dT%H:%M:%S")
handler = logging.StreamHandler()
handler.setFormatter(formatter)
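
With the format string and ISO-style date format configured above, a record emitted through this logger would render roughly as the comment below; the message is one logged elsewhere in this diff and the timestamp is illustrative only:

# 2019-07-04T00:20:57 - INFO - Processing page-1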

View File

@ -3,7 +3,7 @@
from __future__ import absolute_import
__all__ = ('main',)
__all__ = ("main",)
def main():

View File

@ -1,23 +1,23 @@
# -*- coding: utf-8 -*-
VERSION = (0, 7, 2)
PRERELEASE = None # alpha, beta or rc
PRERELEASE = None # alpha, beta or rc
REVISION = None
def generate_version(version, prerelease=None, revision=None):
version_parts = ['.'.join(map(str, version))]
version_parts = [".".join(map(str, version))]
if prerelease is not None:
version_parts.append('-{}'.format(prerelease))
version_parts.append("-{}".format(prerelease))
if revision is not None:
version_parts.append('.{}'.format(revision))
return ''.join(version_parts)
version_parts.append(".{}".format(revision))
return "".join(version_parts)
__title__ = 'camelot-py'
__description__ = 'PDF Table Extraction for Humans.'
__url__ = 'http://camelot-py.readthedocs.io/'
__title__ = "camelot-py"
__description__ = "PDF Table Extraction for Humans."
__url__ = "http://camelot-py.readthedocs.io/"
__version__ = generate_version(VERSION, prerelease=PRERELEASE, revision=REVISION)
__author__ = 'Vinayak Mehta'
__author_email__ = 'vmehta94@gmail.com'
__license__ = 'MIT License'
__author__ = "Vinayak Mehta"
__author_email__ = "vmehta94@gmail.com"
__license__ = "MIT License"

View File

@ -3,6 +3,7 @@
import logging
import click
try:
import matplotlib.pyplot as plt
except ImportError:
@ -13,7 +14,7 @@ else:
from . import __version__, read_pdf, plot
logger = logging.getLogger('camelot')
logger = logging.getLogger("camelot")
logger.setLevel(logging.INFO)
@ -30,23 +31,47 @@ pass_config = click.make_pass_decorator(Config)
@click.group()
@click.version_option(version=__version__)
@click.option('-q', '--quiet', is_flag=False, help='Suppress logs and warnings.')
@click.option('-p', '--pages', default='1', help='Comma-separated page numbers.'
' Example: 1,3,4 or 1,4-end or all.')
@click.option('-pw', '--password', help='Password for decryption.')
@click.option('-o', '--output', help='Output file path.')
@click.option('-f', '--format',
type=click.Choice(['csv', 'json', 'excel', 'html', 'sqlite']),
help='Output file format.')
@click.option('-z', '--zip', is_flag=True, help='Create ZIP archive.')
@click.option('-split', '--split_text', is_flag=True,
help='Split text that spans across multiple cells.')
@click.option('-flag', '--flag_size', is_flag=True, help='Flag text based on'
' font size. Useful to detect super/subscripts.')
@click.option('-strip', '--strip_text', help='Characters that should be stripped from a string before'
' assigning it to a cell.')
@click.option('-M', '--margins', nargs=3, default=(1.0, 0.5, 0.1),
help='PDFMiner char_margin, line_margin and word_margin.')
@click.option("-q", "--quiet", is_flag=False, help="Suppress logs and warnings.")
@click.option(
"-p",
"--pages",
default="1",
help="Comma-separated page numbers." " Example: 1,3,4 or 1,4-end or all.",
)
@click.option("-pw", "--password", help="Password for decryption.")
@click.option("-o", "--output", help="Output file path.")
@click.option(
"-f",
"--format",
type=click.Choice(["csv", "json", "excel", "html", "sqlite"]),
help="Output file format.",
)
@click.option("-z", "--zip", is_flag=True, help="Create ZIP archive.")
@click.option(
"-split",
"--split_text",
is_flag=True,
help="Split text that spans across multiple cells.",
)
@click.option(
"-flag",
"--flag_size",
is_flag=True,
help="Flag text based on" " font size. Useful to detect super/subscripts.",
)
@click.option(
"-strip",
"--strip_text",
help="Characters that should be stripped from a string before"
" assigning it to a cell.",
)
@click.option(
"-M",
"--margins",
nargs=3,
default=(1.0, 0.5, 0.1),
help="PDFMiner char_margin, line_margin and word_margin.",
)
@click.pass_context
def cli(ctx, *args, **kwargs):
"""Camelot: PDF Table Extraction for Humans"""
@ -55,79 +80,131 @@ def cli(ctx, *args, **kwargs):
ctx.obj.set_config(key, value)
@cli.command('lattice')
@click.option('-R', '--table_regions', default=[], multiple=True,
help='Page regions to analyze. Example: x1,y1,x2,y2'
' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
@click.option('-T', '--table_areas', default=[], multiple=True,
help='Table areas to process. Example: x1,y1,x2,y2'
' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
@click.option('-back', '--process_background', is_flag=True,
help='Process background lines.')
@click.option('-scale', '--line_scale', default=15,
help='Line size scaling factor. The larger the value,'
' the smaller the detected lines.')
@click.option('-copy', '--copy_text', default=[], type=click.Choice(['h', 'v']),
multiple=True, help='Direction in which text in a spanning cell'
' will be copied over.')
@click.option('-shift', '--shift_text', default=['l', 't'],
type=click.Choice(['', 'l', 'r', 't', 'b']), multiple=True,
help='Direction in which text in a spanning cell will flow.')
@click.option('-l', '--line_tol', default=2,
help='Tolerance parameter used to merge close vertical'
' and horizontal lines.')
@click.option('-j', '--joint_tol', default=2,
help='Tolerance parameter used to decide whether'
' the detected lines and points lie close to each other.')
@click.option('-block', '--threshold_blocksize', default=15,
help='For adaptive thresholding, size of a pixel'
' neighborhood that is used to calculate a threshold value for'
' the pixel. Example: 3, 5, 7, and so on.')
@click.option('-const', '--threshold_constant', default=-2,
help='For adaptive thresholding, constant subtracted'
' from the mean or weighted mean. Normally, it is positive but'
' may be zero or negative as well.')
@click.option('-I', '--iterations', default=0,
help='Number of times for erosion/dilation will be applied.')
@click.option('-res', '--resolution', default=300,
help='Resolution used for PDF to PNG conversion.')
@click.option('-plot', '--plot_type',
type=click.Choice(['text', 'grid', 'contour', 'joint', 'line']),
help='Plot elements found on PDF page for visual debugging.')
@click.argument('filepath', type=click.Path(exists=True))
@cli.command("lattice")
@click.option(
"-R",
"--table_regions",
default=[],
multiple=True,
help="Page regions to analyze. Example: x1,y1,x2,y2"
" where x1, y1 -> left-top and x2, y2 -> right-bottom.",
)
@click.option(
"-T",
"--table_areas",
default=[],
multiple=True,
help="Table areas to process. Example: x1,y1,x2,y2"
" where x1, y1 -> left-top and x2, y2 -> right-bottom.",
)
@click.option(
"-back", "--process_background", is_flag=True, help="Process background lines."
)
@click.option(
"-scale",
"--line_scale",
default=15,
help="Line size scaling factor. The larger the value,"
" the smaller the detected lines.",
)
@click.option(
"-copy",
"--copy_text",
default=[],
type=click.Choice(["h", "v"]),
multiple=True,
help="Direction in which text in a spanning cell" " will be copied over.",
)
@click.option(
"-shift",
"--shift_text",
default=["l", "t"],
type=click.Choice(["", "l", "r", "t", "b"]),
multiple=True,
help="Direction in which text in a spanning cell will flow.",
)
@click.option(
"-l",
"--line_tol",
default=2,
help="Tolerance parameter used to merge close vertical" " and horizontal lines.",
)
@click.option(
"-j",
"--joint_tol",
default=2,
help="Tolerance parameter used to decide whether"
" the detected lines and points lie close to each other.",
)
@click.option(
"-block",
"--threshold_blocksize",
default=15,
help="For adaptive thresholding, size of a pixel"
" neighborhood that is used to calculate a threshold value for"
" the pixel. Example: 3, 5, 7, and so on.",
)
@click.option(
"-const",
"--threshold_constant",
default=-2,
help="For adaptive thresholding, constant subtracted"
" from the mean or weighted mean. Normally, it is positive but"
" may be zero or negative as well.",
)
@click.option(
"-I",
"--iterations",
default=0,
help="Number of times for erosion/dilation will be applied.",
)
@click.option(
"-res",
"--resolution",
default=300,
help="Resolution used for PDF to PNG conversion.",
)
@click.option(
"-plot",
"--plot_type",
type=click.Choice(["text", "grid", "contour", "joint", "line"]),
help="Plot elements found on PDF page for visual debugging.",
)
@click.argument("filepath", type=click.Path(exists=True))
@pass_config
def lattice(c, *args, **kwargs):
"""Use lines between text to parse the table."""
conf = c.config
pages = conf.pop('pages')
output = conf.pop('output')
f = conf.pop('format')
compress = conf.pop('zip')
quiet = conf.pop('quiet')
plot_type = kwargs.pop('plot_type')
filepath = kwargs.pop('filepath')
pages = conf.pop("pages")
output = conf.pop("output")
f = conf.pop("format")
compress = conf.pop("zip")
quiet = conf.pop("quiet")
plot_type = kwargs.pop("plot_type")
filepath = kwargs.pop("filepath")
kwargs.update(conf)
table_regions = list(kwargs['table_regions'])
kwargs['table_regions'] = None if not table_regions else table_regions
table_areas = list(kwargs['table_areas'])
kwargs['table_areas'] = None if not table_areas else table_areas
copy_text = list(kwargs['copy_text'])
kwargs['copy_text'] = None if not copy_text else copy_text
kwargs['shift_text'] = list(kwargs['shift_text'])
table_regions = list(kwargs["table_regions"])
kwargs["table_regions"] = None if not table_regions else table_regions
table_areas = list(kwargs["table_areas"])
kwargs["table_areas"] = None if not table_areas else table_areas
copy_text = list(kwargs["copy_text"])
kwargs["copy_text"] = None if not copy_text else copy_text
kwargs["shift_text"] = list(kwargs["shift_text"])
if plot_type is not None:
if not _HAS_MPL:
raise ImportError('matplotlib is required for plotting.')
raise ImportError("matplotlib is required for plotting.")
else:
if output is None:
raise click.UsageError('Please specify output file path using --output')
raise click.UsageError("Please specify output file path using --output")
if f is None:
raise click.UsageError('Please specify output file format using --format')
raise click.UsageError("Please specify output file format using --format")
tables = read_pdf(filepath, pages=pages, flavor='lattice',
suppress_stdout=quiet, **kwargs)
click.echo('Found {} tables'.format(tables.n))
tables = read_pdf(
filepath, pages=pages, flavor="lattice", suppress_stdout=quiet, **kwargs
)
click.echo("Found {} tables".format(tables.n))
if plot_type is not None:
for table in tables:
plot(table, kind=plot_type)
@ -136,57 +213,89 @@ def lattice(c, *args, **kwargs):
tables.export(output, f=f, compress=compress)
@cli.command('stream')
@click.option('-R', '--table_regions', default=[], multiple=True,
help='Page regions to analyze. Example: x1,y1,x2,y2'
' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
@click.option('-T', '--table_areas', default=[], multiple=True,
help='Table areas to process. Example: x1,y1,x2,y2'
' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
@click.option('-C', '--columns', default=[], multiple=True,
help='X coordinates of column separators.')
@click.option('-e', '--edge_tol', default=50, help='Tolerance parameter'
' for extending textedges vertically.')
@click.option('-r', '--row_tol', default=2, help='Tolerance parameter'
' used to combine text vertically, to generate rows.')
@click.option('-c', '--column_tol', default=0, help='Tolerance parameter'
' used to combine text horizontally, to generate columns.')
@click.option('-plot', '--plot_type',
type=click.Choice(['text', 'grid', 'contour', 'textedge']),
help='Plot elements found on PDF page for visual debugging.')
@click.argument('filepath', type=click.Path(exists=True))
@cli.command("stream")
@click.option(
"-R",
"--table_regions",
default=[],
multiple=True,
help="Page regions to analyze. Example: x1,y1,x2,y2"
" where x1, y1 -> left-top and x2, y2 -> right-bottom.",
)
@click.option(
"-T",
"--table_areas",
default=[],
multiple=True,
help="Table areas to process. Example: x1,y1,x2,y2"
" where x1, y1 -> left-top and x2, y2 -> right-bottom.",
)
@click.option(
"-C",
"--columns",
default=[],
multiple=True,
help="X coordinates of column separators.",
)
@click.option(
"-e",
"--edge_tol",
default=50,
help="Tolerance parameter" " for extending textedges vertically.",
)
@click.option(
"-r",
"--row_tol",
default=2,
help="Tolerance parameter" " used to combine text vertically, to generate rows.",
)
@click.option(
"-c",
"--column_tol",
default=0,
help="Tolerance parameter"
" used to combine text horizontally, to generate columns.",
)
@click.option(
"-plot",
"--plot_type",
type=click.Choice(["text", "grid", "contour", "textedge"]),
help="Plot elements found on PDF page for visual debugging.",
)
@click.argument("filepath", type=click.Path(exists=True))
@pass_config
def stream(c, *args, **kwargs):
"""Use spaces between text to parse the table."""
conf = c.config
pages = conf.pop('pages')
output = conf.pop('output')
f = conf.pop('format')
compress = conf.pop('zip')
quiet = conf.pop('quiet')
plot_type = kwargs.pop('plot_type')
filepath = kwargs.pop('filepath')
pages = conf.pop("pages")
output = conf.pop("output")
f = conf.pop("format")
compress = conf.pop("zip")
quiet = conf.pop("quiet")
plot_type = kwargs.pop("plot_type")
filepath = kwargs.pop("filepath")
kwargs.update(conf)
table_regions = list(kwargs['table_regions'])
kwargs['table_regions'] = None if not table_regions else table_regions
table_areas = list(kwargs['table_areas'])
kwargs['table_areas'] = None if not table_areas else table_areas
columns = list(kwargs['columns'])
kwargs['columns'] = None if not columns else columns
table_regions = list(kwargs["table_regions"])
kwargs["table_regions"] = None if not table_regions else table_regions
table_areas = list(kwargs["table_areas"])
kwargs["table_areas"] = None if not table_areas else table_areas
columns = list(kwargs["columns"])
kwargs["columns"] = None if not columns else columns
if plot_type is not None:
if not _HAS_MPL:
raise ImportError('matplotlib is required for plotting.')
raise ImportError("matplotlib is required for plotting.")
else:
if output is None:
raise click.UsageError('Please specify output file path using --output')
raise click.UsageError("Please specify output file path using --output")
if f is None:
raise click.UsageError('Please specify output file format using --format')
raise click.UsageError("Please specify output file format using --format")
tables = read_pdf(filepath, pages=pages, flavor='stream',
suppress_stdout=quiet, **kwargs)
click.echo('Found {} tables'.format(tables.n))
tables = read_pdf(
filepath, pages=pages, flavor="stream", suppress_stdout=quiet, **kwargs
)
click.echo("Found {} tables".format(tables.n))
if plot_type is not None:
for table in tables:
plot(table, kind=plot_type)
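
To see how the re-wrapped option declarations above compose on the command line, here is a minimal sketch driving the group through click's test runner; example.pdf is a placeholder path assumed to exist, and only flags declared above are used:

from click.testing import CliRunner
from camelot.cli import cli

runner = CliRunner()
# equivalent to: camelot --pages 1 --format csv --output out.csv lattice -scale 40 example.pdf
result = runner.invoke(
    cli,
    ["--pages", "1", "--format", "csv", "--output", "out.csv",
     "lattice", "-scale", "40", "example.pdf"],
)
print(result.output)  # e.g. "Found 1 tables" on success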

View File

@ -42,7 +42,8 @@ class TextEdge(object):
TEXTEDGE_REQUIRED_ELEMENTS horizontal text rows.
"""
def __init__(self, x, y0, y1, align='left'):
def __init__(self, x, y0, y1, align="left"):
self.x = x
self.y0 = y0
self.y1 = y1
@ -51,8 +52,13 @@ class TextEdge(object):
self.is_valid = False
def __repr__(self):
return '<TextEdge x={} y0={} y1={} align={} valid={}>'.format(
round(self.x, 2), round(self.y0, 2), round(self.y1, 2), self.align, self.is_valid)
return "<TextEdge x={} y0={} y1={} align={} valid={}>".format(
round(self.x, 2),
round(self.y0, 2),
round(self.y1, 2),
self.align,
self.is_valid,
)
def update_coords(self, x, y0, edge_tol=50):
"""Updates the text edge's x and bottom y coordinates and sets
@ -73,9 +79,10 @@ class TextEdges(object):
the PDF page. The dict has three keys based on the alignments,
and each key's value is a list of camelot.core.TextEdge objects.
"""
def __init__(self, edge_tol=50):
self.edge_tol = edge_tol
self._textedges = {'left': [], 'right': [], 'middle': []}
self._textedges = {"left": [], "right": [], "middle": []}
@staticmethod
def get_x_coord(textline, align):
@ -85,7 +92,7 @@ class TextEdges(object):
x_left = textline.x0
x_right = textline.x1
x_middle = x_left + (x_right - x_left) / 2.0
x_coord = {'left': x_left, 'middle': x_middle, 'right': x_right}
x_coord = {"left": x_left, "middle": x_middle, "right": x_right}
return x_coord[align]
def find(self, x_coord, align):
@ -109,21 +116,22 @@ class TextEdges(object):
def update(self, textline):
"""Updates an existing text edge in the current dict.
"""
for align in ['left', 'right', 'middle']:
for align in ["left", "right", "middle"]:
x_coord = self.get_x_coord(textline, align)
idx = self.find(x_coord, align)
if idx is None:
self.add(textline, align)
else:
self._textedges[align][idx].update_coords(
x_coord, textline.y0, edge_tol=self.edge_tol)
x_coord, textline.y0, edge_tol=self.edge_tol
)
def generate(self, textlines):
"""Generates the text edges dict based on horizontal text
rows.
"""
for tl in textlines:
if len(tl.get_text().strip()) > 1: # TODO: hacky
if len(tl.get_text().strip()) > 1: # TODO: hacky
self.update(tl)
def get_relevant(self):
@ -132,9 +140,15 @@ class TextEdges(object):
the most.
"""
intersections_sum = {
'left': sum(te.intersections for te in self._textedges['left'] if te.is_valid),
'right': sum(te.intersections for te in self._textedges['right'] if te.is_valid),
'middle': sum(te.intersections for te in self._textedges['middle'] if te.is_valid)
"left": sum(
te.intersections for te in self._textedges["left"] if te.is_valid
),
"right": sum(
te.intersections for te in self._textedges["right"] if te.is_valid
),
"middle": sum(
te.intersections for te in self._textedges["middle"] if te.is_valid
),
}
# TODO: naive
@ -147,6 +161,7 @@ class TextEdges(object):
"""Returns a dict of interesting table areas on the PDF page
calculated using relevant text edges.
"""
def pad(area, average_row_height):
x0 = area[0] - TABLE_AREA_PADDING
y0 = area[1] - TABLE_AREA_PADDING
@ -175,7 +190,11 @@ class TextEdges(object):
else:
table_areas.pop(found)
updated_area = (
found[0], min(te.y0, found[1]), max(found[2], te.x), max(found[3], te.y1))
found[0],
min(te.y0, found[1]),
max(found[2], te.x),
max(found[3], te.y1),
)
table_areas[updated_area] = None
# extend table areas based on textlines that overlap
@ -196,7 +215,11 @@ class TextEdges(object):
if found is not None:
table_areas.pop(found)
updated_area = (
min(tl.x0, found[0]), min(tl.y0, found[1]), max(found[2], tl.x1), max(found[3], tl.y1))
min(tl.x0, found[0]),
min(tl.y0, found[1]),
max(found[2], tl.x1),
max(found[3], tl.y1),
)
table_areas[updated_area] = None
average_textline_height = sum_textline_height / float(len(textlines))
@ -265,11 +288,12 @@ class Cell(object):
self.bottom = False
self.hspan = False
self.vspan = False
self._text = ''
self._text = ""
def __repr__(self):
return '<Cell x1={} y1={} x2={} y2={}>'.format(
round(self.x1, 2), round(self.y1, 2), round(self.x2, 2), round(self.y2, 2))
return "<Cell x1={} y1={} x2={} y2={}>".format(
round(self.x1, 2), round(self.y1, 2), round(self.x2, 2), round(self.y2, 2)
)
@property
def text(self):
@ -277,7 +301,7 @@ class Cell(object):
@text.setter
def text(self, t):
self._text = ''.join([self._text, t])
self._text = "".join([self._text, t])
@property
def bound(self):
@ -314,11 +338,11 @@ class Table(object):
PDF page number.
"""
def __init__(self, cols, rows):
self.cols = cols
self.rows = rows
self.cells = [[Cell(c[0], r[1], c[1], r[0])
for c in cols] for r in rows]
self.cells = [[Cell(c[0], r[1], c[1], r[0]) for c in cols] for r in rows]
self.df = None
self.shape = (0, 0)
self.accuracy = 0
@ -327,7 +351,7 @@ class Table(object):
self.page = None
def __repr__(self):
return '<{} shape={}>'.format(self.__class__.__name__, self.shape)
return "<{} shape={}>".format(self.__class__.__name__, self.shape)
def __lt__(self, other):
if self.page == other.page:
@ -352,10 +376,10 @@ class Table(object):
"""
# pretty?
report = {
'accuracy': round(self.accuracy, 2),
'whitespace': round(self.whitespace, 2),
'order': self.order,
'page': self.page
"accuracy": round(self.accuracy, 2),
"whitespace": round(self.whitespace, 2),
"order": self.order,
"page": self.page,
}
return report
@ -383,12 +407,21 @@ class Table(object):
for v in vertical:
# find closest x coord
# iterate over y coords and find closest start and end points
i = [i for i, t in enumerate(self.cols)
if np.isclose(v[0], t[0], atol=joint_tol)]
j = [j for j, t in enumerate(self.rows)
if np.isclose(v[3], t[0], atol=joint_tol)]
k = [k for k, t in enumerate(self.rows)
if np.isclose(v[1], t[0], atol=joint_tol)]
i = [
i
for i, t in enumerate(self.cols)
if np.isclose(v[0], t[0], atol=joint_tol)
]
j = [
j
for j, t in enumerate(self.rows)
if np.isclose(v[3], t[0], atol=joint_tol)
]
k = [
k
for k, t in enumerate(self.rows)
if np.isclose(v[1], t[0], atol=joint_tol)
]
if not j:
continue
J = j[0]
@ -434,12 +467,21 @@ class Table(object):
for h in horizontal:
# find closest y coord
# iterate over x coords and find closest start and end points
i = [i for i, t in enumerate(self.rows)
if np.isclose(h[1], t[0], atol=joint_tol)]
j = [j for j, t in enumerate(self.cols)
if np.isclose(h[0], t[0], atol=joint_tol)]
k = [k for k, t in enumerate(self.cols)
if np.isclose(h[2], t[0], atol=joint_tol)]
i = [
i
for i, t in enumerate(self.rows)
if np.isclose(h[1], t[0], atol=joint_tol)
]
j = [
j
for j, t in enumerate(self.cols)
if np.isclose(h[0], t[0], atol=joint_tol)
]
k = [
k
for k, t in enumerate(self.cols)
if np.isclose(h[2], t[0], atol=joint_tol)
]
if not j:
continue
J = j[0]
@ -537,12 +579,7 @@ class Table(object):
Output filepath.
"""
kw = {
'encoding': 'utf-8',
'index': False,
'header': False,
'quoting': 1
}
kw = {"encoding": "utf-8", "index": False, "header": False, "quoting": 1}
kw.update(kwargs)
self.df.to_csv(path, **kw)
@ -557,12 +594,10 @@ class Table(object):
Output filepath.
"""
kw = {
'orient': 'records'
}
kw = {"orient": "records"}
kw.update(kwargs)
json_string = self.df.to_json(**kw)
with open(path, 'w') as f:
with open(path, "w") as f:
f.write(json_string)
def to_excel(self, path, **kwargs):
@ -577,8 +612,8 @@ class Table(object):
"""
kw = {
'sheet_name': 'page-{}-table-{}'.format(self.page, self.order),
'encoding': 'utf-8'
"sheet_name": "page-{}-table-{}".format(self.page, self.order),
"encoding": "utf-8",
}
kw.update(kwargs)
writer = pd.ExcelWriter(path)
@ -597,7 +632,7 @@ class Table(object):
"""
html_string = self.df.to_html(**kwargs)
with open(path, 'w') as f:
with open(path, "w") as f:
f.write(html_string)
def to_sqlite(self, path, **kwargs):
@ -611,13 +646,10 @@ class Table(object):
Output filepath.
"""
kw = {
'if_exists': 'replace',
'index': False
}
kw = {"if_exists": "replace", "index": False}
kw.update(kwargs)
conn = sqlite3.connect(path)
table_name = 'page-{}-table-{}'.format(self.page, self.order)
table_name = "page-{}-table-{}".format(self.page, self.order)
self.df.to_sql(table_name, conn, **kw)
conn.commit()
conn.close()
@ -633,12 +665,12 @@ class TableList(object):
Number of tables in the list.
"""
def __init__(self, tables):
self._tables = tables
def __repr__(self):
return '<{} n={}>'.format(
self.__class__.__name__, self.n)
return "<{} n={}>".format(self.__class__.__name__, self.n)
def __len__(self):
return len(self._tables)
@ -648,37 +680,39 @@ class TableList(object):
@staticmethod
def _format_func(table, f):
return getattr(table, 'to_{}'.format(f))
return getattr(table, "to_{}".format(f))
@property
def n(self):
return len(self)
def _write_file(self, f=None, **kwargs):
dirname = kwargs.get('dirname')
root = kwargs.get('root')
ext = kwargs.get('ext')
dirname = kwargs.get("dirname")
root = kwargs.get("root")
ext = kwargs.get("ext")
for table in self._tables:
filename = os.path.join('{}-page-{}-table-{}{}'.format(
root, table.page, table.order, ext))
filename = os.path.join(
"{}-page-{}-table-{}{}".format(root, table.page, table.order, ext)
)
filepath = os.path.join(dirname, filename)
to_format = self._format_func(table, f)
to_format(filepath)
def _compress_dir(self, **kwargs):
path = kwargs.get('path')
dirname = kwargs.get('dirname')
root = kwargs.get('root')
ext = kwargs.get('ext')
zipname = os.path.join(os.path.dirname(path), root) + '.zip'
with zipfile.ZipFile(zipname, 'w', allowZip64=True) as z:
path = kwargs.get("path")
dirname = kwargs.get("dirname")
root = kwargs.get("root")
ext = kwargs.get("ext")
zipname = os.path.join(os.path.dirname(path), root) + ".zip"
with zipfile.ZipFile(zipname, "w", allowZip64=True) as z:
for table in self._tables:
filename = os.path.join('{}-page-{}-table-{}{}'.format(
root, table.page, table.order, ext))
filename = os.path.join(
"{}-page-{}-table-{}{}".format(root, table.page, table.order, ext)
)
filepath = os.path.join(dirname, filename)
z.write(filepath, os.path.basename(filepath))
def export(self, path, f='csv', compress=False):
def export(self, path, f="csv", compress=False):
"""Exports the list of tables to specified file format.
Parameters
@ -697,33 +731,28 @@ class TableList(object):
if compress:
dirname = tempfile.mkdtemp()
kwargs = {
'path': path,
'dirname': dirname,
'root': root,
'ext': ext
}
kwargs = {"path": path, "dirname": dirname, "root": root, "ext": ext}
if f in ['csv', 'json', 'html']:
if f in ["csv", "json", "html"]:
self._write_file(f=f, **kwargs)
if compress:
self._compress_dir(**kwargs)
elif f == 'excel':
elif f == "excel":
filepath = os.path.join(dirname, basename)
writer = pd.ExcelWriter(filepath)
for table in self._tables:
sheet_name = 'page-{}-table-{}'.format(table.page, table.order)
table.df.to_excel(writer, sheet_name=sheet_name, encoding='utf-8')
sheet_name = "page-{}-table-{}".format(table.page, table.order)
table.df.to_excel(writer, sheet_name=sheet_name, encoding="utf-8")
writer.save()
if compress:
zipname = os.path.join(os.path.dirname(path), root) + '.zip'
with zipfile.ZipFile(zipname, 'w', allowZip64=True) as z:
zipname = os.path.join(os.path.dirname(path), root) + ".zip"
with zipfile.ZipFile(zipname, "w", allowZip64=True) as z:
z.write(filepath, os.path.basename(filepath))
elif f == 'sqlite':
elif f == "sqlite":
filepath = os.path.join(dirname, basename)
for table in self._tables:
table.to_sqlite(filepath)
if compress:
zipname = os.path.join(os.path.dirname(path), root) + '.zip'
with zipfile.ZipFile(zipname, 'w', allowZip64=True) as z:
zipname = os.path.join(os.path.dirname(path), root) + ".zip"
with zipfile.ZipFile(zipname, "w", allowZip64=True) as z:
z.write(filepath, os.path.basename(filepath))

View File

@ -24,10 +24,10 @@ ghostscript - A Python interface for the Ghostscript interpreter C-API
from . import _gsprint as gs
__author__ = 'Hartmut Goebel <h.goebel@crazy-compilers.com>'
__copyright__ = 'Copyright 2010-2018 by Hartmut Goebel <h.goebel@crazy-compilers.com>'
__license__ = 'GNU General Public License version 3 (GPL v3)'
__version__ = '0.6'
__author__ = "Hartmut Goebel <h.goebel@crazy-compilers.com>"
__copyright__ = "Copyright 2010-2018 by Hartmut Goebel <h.goebel@crazy-compilers.com>"
__license__ = "GNU General Public License version 3 (GPL v3)"
__version__ = "0.6"
class __Ghostscript(object):
@ -87,10 +87,13 @@ def Ghostscript(*args, **kwargs):
# Ghostscript only supports a single instance
if __instance__ is None:
__instance__ = gs.new_instance()
return __Ghostscript(__instance__, args,
stdin=kwargs.get('stdin', None),
stdout=kwargs.get('stdout', None),
stderr=kwargs.get('stderr', None))
return __Ghostscript(
__instance__,
args,
stdin=kwargs.get("stdin", None),
stdout=kwargs.get("stdout", None),
stderr=kwargs.get("stderr", None),
)
__instance__ = None

View File

@ -42,10 +42,10 @@ e_Info = -110
#
e_Quit = -101
__author__ = 'Hartmut Goebel <h.goebel@crazy-compilers.com>'
__copyright__ = 'Copyright 2010-2018 by Hartmut Goebel <h.goebel@crazy-compilers.com>'
__license__ = 'GNU General Public License version 3 (GPL v3)'
__version__ = '0.6'
__author__ = "Hartmut Goebel <h.goebel@crazy-compilers.com>"
__copyright__ = "Copyright 2010-2018 by Hartmut Goebel <h.goebel@crazy-compilers.com>"
__license__ = "GNU General Public License version 3 (GPL v3)"
__version__ = "0.6"
gs_main_instance = c_void_p
display_callback = c_void_p
@ -55,7 +55,7 @@ display_callback = c_void_p
class GhostscriptError(Exception):
def __init__(self, ecode):
self.code = ecode
self.code = ecode
def new_instance():
@ -89,6 +89,7 @@ def _wrap_stdin(infp):
"""Wrap a filehandle into a C function to be used as `stdin` callback
for ``set_stdio``. The filehandle has to support the readline() method.
"""
def _wrap(instance, dest, count):
try:
data = infp.readline(count)
@ -110,6 +111,7 @@ def _wrap_stdout(outfp):
`stderr` callback for ``set_stdio``. The filehandle has to support the
write() and flush() methods.
"""
def _wrap(instance, str, count):
outfp.write(str[:count])
outfp.flush()
@ -187,11 +189,23 @@ def __win32_finddll():
import winreg
except ImportError:
# assume Python 2
from _winreg import OpenKey, CloseKey, EnumKey, QueryValueEx, \
QueryInfoKey, HKEY_LOCAL_MACHINE
from _winreg import (
OpenKey,
CloseKey,
EnumKey,
QueryValueEx,
QueryInfoKey,
HKEY_LOCAL_MACHINE,
)
else:
from winreg import OpenKey, CloseKey, EnumKey, QueryValueEx, \
QueryInfoKey, HKEY_LOCAL_MACHINE
from winreg import (
OpenKey,
CloseKey,
EnumKey,
QueryValueEx,
QueryInfoKey,
HKEY_LOCAL_MACHINE,
)
from distutils.version import LooseVersion
import os
@ -199,15 +213,19 @@ def __win32_finddll():
dlls = []
# Look up different variants of Ghostscript and take the highest
# version for which the DLL is to be found in the filesystem.
for key_name in ('AFPL Ghostscript', 'Aladdin Ghostscript',
'GNU Ghostscript', 'GPL Ghostscript'):
for key_name in (
"AFPL Ghostscript",
"Aladdin Ghostscript",
"GNU Ghostscript",
"GPL Ghostscript",
):
try:
k1 = OpenKey(HKEY_LOCAL_MACHINE, "Software\\%s" % key_name)
for num in range(0, QueryInfoKey(k1)[0]):
version = EnumKey(k1, num)
try:
k2 = OpenKey(k1, version)
dll_path = QueryValueEx(k2, 'GS_DLL')[0]
dll_path = QueryValueEx(k2, "GS_DLL")[0]
CloseKey(k2)
if os.path.exists(dll_path):
dlls.append((LooseVersion(version), dll_path))
@ -223,21 +241,21 @@ def __win32_finddll():
return None
if sys.platform == 'win32':
if sys.platform == "win32":
libgs = __win32_finddll()
if not libgs:
raise RuntimeError('Please make sure that Ghostscript is installed')
raise RuntimeError("Please make sure that Ghostscript is installed")
libgs = windll.LoadLibrary(libgs)
else:
try:
libgs = cdll.LoadLibrary('libgs.so')
libgs = cdll.LoadLibrary("libgs.so")
except OSError:
# shared object file not found
import ctypes.util
libgs = ctypes.util.find_library('gs')
libgs = ctypes.util.find_library("gs")
if not libgs:
raise RuntimeError('Please make sure that Ghostscript is installed')
raise RuntimeError("Please make sure that Ghostscript is installed")
libgs = cdll.LoadLibrary(libgs)
del __win32_finddll

View File

@ -7,8 +7,14 @@ from PyPDF2 import PdfFileReader, PdfFileWriter
from .core import TableList
from .parsers import Stream, Lattice
from .utils import (TemporaryDirectory, get_page_layout, get_text_objects,
get_rotation, is_url, download_url)
from .utils import (
TemporaryDirectory,
get_page_layout,
get_text_objects,
get_rotation,
is_url,
download_url,
)
class PDFHandler(object):
@ -27,19 +33,20 @@ class PDFHandler(object):
Password for decryption.
"""
def __init__(self, filepath, pages='1', password=None):
def __init__(self, filepath, pages="1", password=None):
if is_url(filepath):
filepath = download_url(filepath)
self.filepath = filepath
if not filepath.lower().endswith('.pdf'):
if not filepath.lower().endswith(".pdf"):
raise NotImplementedError("File format not supported")
if password is None:
self.password = ''
self.password = ""
else:
self.password = password
if sys.version_info[0] < 3:
self.password = self.password.encode('ascii')
self.password = self.password.encode("ascii")
self.pages = self._get_pages(self.filepath, pages)
def _get_pages(self, filepath, pages):
@ -60,26 +67,26 @@ class PDFHandler(object):
"""
page_numbers = []
if pages == '1':
page_numbers.append({'start': 1, 'end': 1})
if pages == "1":
page_numbers.append({"start": 1, "end": 1})
else:
infile = PdfFileReader(open(filepath, 'rb'), strict=False)
infile = PdfFileReader(open(filepath, "rb"), strict=False)
if infile.isEncrypted:
infile.decrypt(self.password)
if pages == 'all':
page_numbers.append({'start': 1, 'end': infile.getNumPages()})
if pages == "all":
page_numbers.append({"start": 1, "end": infile.getNumPages()})
else:
for r in pages.split(','):
if '-' in r:
a, b = r.split('-')
if b == 'end':
for r in pages.split(","):
if "-" in r:
a, b = r.split("-")
if b == "end":
b = infile.getNumPages()
page_numbers.append({'start': int(a), 'end': int(b)})
page_numbers.append({"start": int(a), "end": int(b)})
else:
page_numbers.append({'start': int(r), 'end': int(r)})
page_numbers.append({"start": int(r), "end": int(r)})
P = []
for p in page_numbers:
P.extend(range(p['start'], p['end'] + 1))
P.extend(range(p["start"], p["end"] + 1))
return sorted(set(P))
def _save_page(self, filepath, page, temp):
@ -95,16 +102,16 @@ class PDFHandler(object):
Tmp directory.
"""
with open(filepath, 'rb') as fileobj:
with open(filepath, "rb") as fileobj:
infile = PdfFileReader(fileobj, strict=False)
if infile.isEncrypted:
infile.decrypt(self.password)
fpath = os.path.join(temp, 'page-{0}.pdf'.format(page))
fpath = os.path.join(temp, "page-{0}.pdf".format(page))
froot, fext = os.path.splitext(fpath)
p = infile.getPage(page - 1)
outfile = PdfFileWriter()
outfile.addPage(p)
with open(fpath, 'wb') as f:
with open(fpath, "wb") as f:
outfile.write(f)
layout, dim = get_page_layout(fpath)
# fix rotated PDF
@ -112,23 +119,25 @@ class PDFHandler(object):
horizontal_text = get_text_objects(layout, ltype="horizontal_text")
vertical_text = get_text_objects(layout, ltype="vertical_text")
rotation = get_rotation(chars, horizontal_text, vertical_text)
if rotation != '':
fpath_new = ''.join([froot.replace('page', 'p'), '_rotated', fext])
if rotation != "":
fpath_new = "".join([froot.replace("page", "p"), "_rotated", fext])
os.rename(fpath, fpath_new)
infile = PdfFileReader(open(fpath_new, 'rb'), strict=False)
infile = PdfFileReader(open(fpath_new, "rb"), strict=False)
if infile.isEncrypted:
infile.decrypt(self.password)
outfile = PdfFileWriter()
p = infile.getPage(0)
if rotation == 'anticlockwise':
if rotation == "anticlockwise":
p.rotateClockwise(90)
elif rotation == 'clockwise':
elif rotation == "clockwise":
p.rotateCounterClockwise(90)
outfile.addPage(p)
with open(fpath, 'wb') as f:
with open(fpath, "wb") as f:
outfile.write(f)
def parse(self, flavor='lattice', suppress_stdout=False, layout_kwargs={}, **kwargs):
def parse(
self, flavor="lattice", suppress_stdout=False, layout_kwargs={}, **kwargs
):
"""Extracts tables by calling parser.get_tables on all single
page PDFs.
@ -154,11 +163,13 @@ class PDFHandler(object):
with TemporaryDirectory() as tempdir:
for p in self.pages:
self._save_page(self.filepath, p, tempdir)
pages = [os.path.join(tempdir, 'page-{0}.pdf'.format(p))
for p in self.pages]
parser = Lattice(**kwargs) if flavor == 'lattice' else Stream(**kwargs)
pages = [
os.path.join(tempdir, "page-{0}.pdf".format(p)) for p in self.pages
]
parser = Lattice(**kwargs) if flavor == "lattice" else Stream(**kwargs)
for p in pages:
t = parser.extract_tables(p, suppress_stdout=suppress_stdout,
layout_kwargs=layout_kwargs)
t = parser.extract_tables(
p, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs
)
tables.extend(t)
return TableList(sorted(tables))
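
The page-string handling in _get_pages above is easiest to see as a worked example; assuming a 10-page document, the accepted forms resolve to the following page lists (a sketch of the mapping, not captured output):

# "1"       -> [1]
# "1,3,4"   -> [1, 3, 4]
# "1,4-end" -> [1, 4, 5, 6, 7, 8, 9, 10]
# "all"     -> [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]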

View File

@ -39,17 +39,23 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
if process_background:
threshold = cv2.adaptiveThreshold(
gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
cv2.THRESH_BINARY, blocksize, c)
gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, blocksize, c
)
else:
threshold = cv2.adaptiveThreshold(
np.invert(gray), 255,
cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, blocksize, c)
np.invert(gray),
255,
cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
cv2.THRESH_BINARY,
blocksize,
c,
)
return img, threshold
def find_lines(threshold, regions=None, direction='horizontal',
line_scale=15, iterations=0):
def find_lines(
threshold, regions=None, direction="horizontal", line_scale=15, iterations=0
):
"""Finds horizontal and vertical lines by applying morphological
transformations on an image.
@ -87,15 +93,14 @@ def find_lines(threshold, regions=None, direction='horizontal',
"""
lines = []
if direction == 'vertical':
if direction == "vertical":
size = threshold.shape[0] // line_scale
el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size))
elif direction == 'horizontal':
elif direction == "horizontal":
size = threshold.shape[1] // line_scale
el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1))
elif direction is None:
raise ValueError("Specify direction as either 'vertical' or"
" 'horizontal'")
raise ValueError("Specify direction as either 'vertical' or 'horizontal'")
if regions is not None:
region_mask = np.zeros(threshold.shape)
@ -110,19 +115,21 @@ def find_lines(threshold, regions=None, direction='horizontal',
try:
_, contours, _ = cv2.findContours(
threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
)
except ValueError:
# for opencv backward compatibility
contours, _ = cv2.findContours(
threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
)
for c in contours:
x, y, w, h = cv2.boundingRect(c)
x1, x2 = x, x + w
y1, y2 = y, y + h
if direction == 'vertical':
if direction == "vertical":
lines.append(((x1 + x2) // 2, y2, (x1 + x2) // 2, y1))
elif direction == 'horizontal':
elif direction == "horizontal":
lines.append((x1, (y1 + y2) // 2, x2, (y1 + y2) // 2))
return dmask, lines
@ -150,11 +157,13 @@ def find_contours(vertical, horizontal):
try:
__, contours, __ = cv2.findContours(
mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
)
except ValueError:
# for opencv backward compatibility
contours, __ = cv2.findContours(
mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
)
# sort in reverse based on contour area and use first 10 contours
contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10]
@ -196,11 +205,13 @@ def find_joints(contours, vertical, horizontal):
roi = joints[y : y + h, x : x + w]
try:
__, jc, __ = cv2.findContours(
roi.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
roi.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE
)
except ValueError:
# for opencv backward compatibility
jc, __ = cv2.findContours(
roi.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
roi.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE
)
if len(jc) <= 4: # remove contours with less than 4 joints
continue
joint_coords = []
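
Taken together, the functions re-wrapped in this file form the image pipeline used by the Lattice parser; a minimal sketch of how they chain, assuming the module is importable as camelot.image_processing (as the parser imports below suggest) and that a rendered page image exists at the placeholder path page-1.png:

from camelot.image_processing import (
    adaptive_threshold,
    find_lines,
    find_contours,
    find_joints,
)

# threshold the rendered page, then detect line segments in both directions
img, threshold = adaptive_threshold("page-1.png", process_background=False, blocksize=15, c=-2)
vertical_mask, vertical_segments = find_lines(threshold, direction="vertical", line_scale=15)
horizontal_mask, horizontal_segments = find_lines(threshold, direction="horizontal", line_scale=15)

# intersect the two masks to get table contours, then the line joints inside each contour
contours = find_contours(vertical_mask, horizontal_mask)
table_bbox = find_joints(contours, vertical_mask, horizontal_mask)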

View File

@ -6,8 +6,15 @@ from .handlers import PDFHandler
from .utils import validate_input, remove_extra
def read_pdf(filepath, pages='1', password=None, flavor='lattice',
suppress_stdout=False, layout_kwargs={}, **kwargs):
def read_pdf(
filepath,
pages="1",
password=None,
flavor="lattice",
suppress_stdout=False,
layout_kwargs={},
**kwargs
):
"""Read PDF and return extracted tables.
Note: kwargs annotated with ^ can only be used with flavor='stream'
@ -91,9 +98,10 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice',
tables : camelot.core.TableList
"""
if flavor not in ['lattice', 'stream']:
raise NotImplementedError("Unknown flavor specified."
" Use either 'lattice' or 'stream'")
if flavor not in ["lattice", "stream"]:
raise NotImplementedError(
"Unknown flavor specified." " Use either 'lattice' or 'stream'"
)
with warnings.catch_warnings():
if suppress_stdout:
@ -102,6 +110,10 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice',
validate_input(kwargs, flavor=flavor)
p = PDFHandler(filepath, pages=pages, password=password)
kwargs = remove_extra(kwargs, flavor=flavor)
tables = p.parse(flavor=flavor, suppress_stdout=suppress_stdout,
layout_kwargs=layout_kwargs, **kwargs)
tables = p.parse(
flavor=flavor,
suppress_stdout=suppress_stdout,
layout_kwargs=layout_kwargs,
**kwargs
)
return tables
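
For context, the entry point whose signature is re-wrapped above is typically used as follows; a minimal sketch with foo.pdf and foo.csv as placeholder paths:

import camelot

# lattice is the default flavor; pass flavor="stream" for the whitespace-based parser
tables = camelot.read_pdf("foo.pdf", pages="1", flavor="lattice")
print(tables.n)                                    # number of tables found
tables.export("foo.csv", f="csv", compress=False)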

View File

@ -8,13 +8,13 @@ from ..utils import get_page_layout, get_text_objects
class BaseParser(object):
"""Defines a base parser.
"""
def _generate_layout(self, filename, layout_kwargs):
self.filename = filename
self.layout_kwargs = layout_kwargs
self.layout, self.dimensions = get_page_layout(
filename, **layout_kwargs)
self.images = get_text_objects(self.layout, ltype='image')
self.horizontal_text = get_text_objects(self.layout, ltype='horizontal_text')
self.vertical_text = get_text_objects(self.layout, ltype='vertical_text')
self.layout, self.dimensions = get_page_layout(filename, **layout_kwargs)
self.images = get_text_objects(self.layout, ltype="image")
self.horizontal_text = get_text_objects(self.layout, ltype="horizontal_text")
self.vertical_text = get_text_objects(self.layout, ltype="vertical_text")
self.pdf_width, self.pdf_height = self.dimensions
self.rootname, __ = os.path.splitext(self.filename)

View File

@ -14,14 +14,25 @@ import pandas as pd
from .base import BaseParser
from ..core import Table
from ..utils import (scale_image, scale_pdf, segments_in_bbox, text_in_bbox,
merge_close_lines, get_table_index, compute_accuracy,
compute_whitespace)
from ..image_processing import (adaptive_threshold, find_lines,
find_contours, find_joints)
from ..utils import (
scale_image,
scale_pdf,
segments_in_bbox,
text_in_bbox,
merge_close_lines,
get_table_index,
compute_accuracy,
compute_whitespace,
)
from ..image_processing import (
adaptive_threshold,
find_lines,
find_contours,
find_joints,
)
logger = logging.getLogger('camelot')
logger = logging.getLogger("camelot")
class Lattice(BaseParser):
@ -83,11 +94,26 @@ class Lattice(BaseParser):
Resolution used for PDF to PNG conversion.
"""
def __init__(self, table_regions=None, table_areas=None, process_background=False,
line_scale=15, copy_text=None, shift_text=['l', 't'],
split_text=False, flag_size=False, strip_text='', line_tol=2,
joint_tol=2, threshold_blocksize=15, threshold_constant=-2,
iterations=0, resolution=300, **kwargs):
def __init__(
self,
table_regions=None,
table_areas=None,
process_background=False,
line_scale=15,
copy_text=None,
shift_text=["l", "t"],
split_text=False,
flag_size=False,
strip_text="",
line_tol=2,
joint_tol=2,
threshold_blocksize=15,
threshold_constant=-2,
iterations=0,
resolution=300,
**kwargs
):
self.table_regions = table_regions
self.table_areas = table_areas
self.process_background = process_background
@ -130,19 +156,19 @@ class Lattice(BaseParser):
indices = []
for r_idx, c_idx, text in idx:
for d in shift_text:
if d == 'l':
if d == "l":
if t.cells[r_idx][c_idx].hspan:
while not t.cells[r_idx][c_idx].left:
c_idx -= 1
if d == 'r':
if d == "r":
if t.cells[r_idx][c_idx].hspan:
while not t.cells[r_idx][c_idx].right:
c_idx += 1
if d == 't':
if d == "t":
if t.cells[r_idx][c_idx].vspan:
while not t.cells[r_idx][c_idx].top:
r_idx -= 1
if d == 'b':
if d == "b":
if t.cells[r_idx][c_idx].vspan:
while not t.cells[r_idx][c_idx].bottom:
r_idx += 1
@ -171,13 +197,13 @@ class Lattice(BaseParser):
if f == "h":
for i in range(len(t.cells)):
for j in range(len(t.cells[i])):
if t.cells[i][j].text.strip() == '':
if t.cells[i][j].text.strip() == "":
if t.cells[i][j].hspan and not t.cells[i][j].left:
t.cells[i][j].text = t.cells[i][j - 1].text
elif f == "v":
for i in range(len(t.cells)):
for j in range(len(t.cells[i])):
if t.cells[i][j].text.strip() == '':
if t.cells[i][j].text.strip() == "":
if t.cells[i][j].vspan and not t.cells[i][j].top:
t.cells[i][j].text = t.cells[i - 1][j].text
return t
@ -185,11 +211,12 @@ class Lattice(BaseParser):
def _generate_image(self):
from ..ext.ghostscript import Ghostscript
self.imagename = ''.join([self.rootname, '.png'])
gs_call = '-q -sDEVICE=png16m -o {} -r300 {}'.format(
self.imagename, self.filename)
self.imagename = "".join([self.rootname, ".png"])
gs_call = "-q -sDEVICE=png16m -o {} -r300 {}".format(
self.imagename, self.filename
)
gs_call = gs_call.encode().split()
null = open(os.devnull, 'wb')
null = open(os.devnull, "wb")
with Ghostscript(*gs_call, stdout=null) as gs:
pass
null.close()
@ -208,8 +235,11 @@ class Lattice(BaseParser):
return scaled_areas
self.image, self.threshold = adaptive_threshold(
self.imagename, process_background=self.process_background,
blocksize=self.threshold_blocksize, c=self.threshold_constant)
self.imagename,
process_background=self.process_background,
blocksize=self.threshold_blocksize,
c=self.threshold_constant,
)
image_width = self.image.shape[1]
image_height = self.image.shape[0]
@ -226,21 +256,35 @@ class Lattice(BaseParser):
regions = scale_areas(self.table_regions)
vertical_mask, vertical_segments = find_lines(
self.threshold, regions=regions, direction='vertical',
line_scale=self.line_scale, iterations=self.iterations)
self.threshold,
regions=regions,
direction="vertical",
line_scale=self.line_scale,
iterations=self.iterations,
)
horizontal_mask, horizontal_segments = find_lines(
self.threshold, regions=regions, direction='horizontal',
line_scale=self.line_scale, iterations=self.iterations)
self.threshold,
regions=regions,
direction="horizontal",
line_scale=self.line_scale,
iterations=self.iterations,
)
contours = find_contours(vertical_mask, horizontal_mask)
table_bbox = find_joints(contours, vertical_mask, horizontal_mask)
else:
vertical_mask, vertical_segments = find_lines(
self.threshold, direction='vertical', line_scale=self.line_scale,
iterations=self.iterations)
self.threshold,
direction="vertical",
line_scale=self.line_scale,
iterations=self.iterations,
)
horizontal_mask, horizontal_segments = find_lines(
self.threshold, direction='horizontal', line_scale=self.line_scale,
iterations=self.iterations)
self.threshold,
direction="horizontal",
line_scale=self.line_scale,
iterations=self.iterations,
)
areas = scale_areas(self.table_areas)
table_bbox = find_joints(areas, vertical_mask, horizontal_mask)
@ -248,18 +292,20 @@ class Lattice(BaseParser):
self.table_bbox_unscaled = copy.deepcopy(table_bbox)
self.table_bbox, self.vertical_segments, self.horizontal_segments = scale_image(
table_bbox, vertical_segments, horizontal_segments, pdf_scalers)
table_bbox, vertical_segments, horizontal_segments, pdf_scalers
)
def _generate_columns_and_rows(self, table_idx, tk):
# select elements which lie within table_bbox
t_bbox = {}
v_s, h_s = segments_in_bbox(
tk, self.vertical_segments, self.horizontal_segments)
t_bbox['horizontal'] = text_in_bbox(tk, self.horizontal_text)
t_bbox['vertical'] = text_in_bbox(tk, self.vertical_text)
tk, self.vertical_segments, self.horizontal_segments
)
t_bbox["horizontal"] = text_in_bbox(tk, self.horizontal_text)
t_bbox["vertical"] = text_in_bbox(tk, self.vertical_text)
t_bbox['horizontal'].sort(key=lambda x: (-x.y0, x.x0))
t_bbox['vertical'].sort(key=lambda x: (x.x0, -x.y0))
t_bbox["horizontal"].sort(key=lambda x: (-x.y0, x.x0))
t_bbox["vertical"].sort(key=lambda x: (x.x0, -x.y0))
self.t_bbox = t_bbox
@ -268,23 +314,19 @@ class Lattice(BaseParser):
cols.extend([tk[0], tk[2]])
rows.extend([tk[1], tk[3]])
# sort horizontal and vertical segments
cols = merge_close_lines(
sorted(cols), line_tol=self.line_tol)
rows = merge_close_lines(
sorted(rows, reverse=True), line_tol=self.line_tol)
cols = merge_close_lines(sorted(cols), line_tol=self.line_tol)
rows = merge_close_lines(sorted(rows, reverse=True), line_tol=self.line_tol)
# make grid using x and y coord of shortlisted rows and cols
cols = [(cols[i], cols[i + 1])
for i in range(0, len(cols) - 1)]
rows = [(rows[i], rows[i + 1])
for i in range(0, len(rows) - 1)]
cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)]
return cols, rows, v_s, h_s
def _generate_table(self, table_idx, cols, rows, **kwargs):
v_s = kwargs.get('v_s')
h_s = kwargs.get('h_s')
v_s = kwargs.get("v_s")
h_s = kwargs.get("h_s")
if v_s is None or h_s is None:
raise ValueError('No segments found on {}'.format(self.rootname))
raise ValueError("No segments found on {}".format(self.rootname))
table = Table(cols, rows)
# set table edges to True using ver+hor lines
@ -297,14 +339,21 @@ class Lattice(BaseParser):
pos_errors = []
# TODO: have a single list in place of two directional ones?
# sorted on x-coordinate based on reading order i.e. LTR or RTL
for direction in ['vertical', 'horizontal']:
for direction in ["vertical", "horizontal"]:
for t in self.t_bbox[direction]:
indices, error = get_table_index(
table, t, direction, split_text=self.split_text,
flag_size=self.flag_size, strip_text=self.strip_text)
table,
t,
direction,
split_text=self.split_text,
flag_size=self.flag_size,
strip_text=self.strip_text,
)
if indices[:2] != (-1, -1):
pos_errors.append(error)
indices = Lattice._reduce_index(table, indices, shift_text=self.shift_text)
indices = Lattice._reduce_index(
table, indices, shift_text=self.shift_text
)
for r_idx, c_idx, text in indices:
table.cells[r_idx][c_idx].text = text
accuracy = compute_accuracy([[100, pos_errors]])
@ -317,11 +366,11 @@ class Lattice(BaseParser):
table.shape = table.df.shape
whitespace = compute_whitespace(data)
table.flavor = 'lattice'
table.flavor = "lattice"
table.accuracy = accuracy
table.whitespace = whitespace
table.order = table_idx + 1
table.page = int(os.path.basename(self.rootname).replace('page-', ''))
table.page = int(os.path.basename(self.rootname).replace("page-", ""))
# for plotting
_text = []
@ -337,15 +386,18 @@ class Lattice(BaseParser):
def extract_tables(self, filename, suppress_stdout=False, layout_kwargs={}):
self._generate_layout(filename, layout_kwargs)
if not suppress_stdout:
logger.info('Processing {}'.format(os.path.basename(self.rootname)))
logger.info("Processing {}".format(os.path.basename(self.rootname)))
if not self.horizontal_text:
if self.images:
warnings.warn('{} is image-based, camelot only works on'
' text-based pages.'.format(os.path.basename(self.rootname)))
warnings.warn(
"{} is image-based, camelot only works on"
" text-based pages.".format(os.path.basename(self.rootname))
)
else:
warnings.warn('No tables found on {}'.format(
os.path.basename(self.rootname)))
warnings.warn(
"No tables found on {}".format(os.path.basename(self.rootname))
)
return []
self._generate_image()
@ -353,8 +405,9 @@ class Lattice(BaseParser):
_tables = []
# sort tables based on y-coord
for table_idx, tk in enumerate(sorted(
self.table_bbox.keys(), key=lambda x: x[1], reverse=True)):
for table_idx, tk in enumerate(
sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True)
):
cols, rows, v_s, h_s = self._generate_columns_and_rows(table_idx, tk)
table = self._generate_table(table_idx, cols, rows, v_s=v_s, h_s=h_s)
table._bbox = tk
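
The Lattice-specific keyword arguments reformatted above reach the parser through read_pdf; a hypothetical call exercising a few of them (the path and values are placeholders, not recommendations):

import camelot

tables = camelot.read_pdf(
    "spanning_cells.pdf",      # placeholder path
    flavor="lattice",
    line_scale=40,             # larger value -> smaller lines are detected
    copy_text=["v"],           # copy text vertically into empty spanning cells
    shift_text=["r", "b"],     # let spanning-cell text flow right / bottom
)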

View File

@ -10,11 +10,10 @@ import pandas as pd
from .base import BaseParser
from ..core import TextEdges, Table
from ..utils import (text_in_bbox, get_table_index, compute_accuracy,
compute_whitespace)
from ..utils import text_in_bbox, get_table_index, compute_accuracy, compute_whitespace
logger = logging.getLogger('camelot')
logger = logging.getLogger("camelot")
class Stream(BaseParser):
@ -55,9 +54,20 @@ class Stream(BaseParser):
to generate columns.
"""
def __init__(self, table_regions=None, table_areas=None, columns=None, split_text=False,
flag_size=False, strip_text='', edge_tol=50, row_tol=2,
column_tol=0, **kwargs):
def __init__(
self,
table_regions=None,
table_areas=None,
columns=None,
split_text=False,
flag_size=False,
strip_text="",
edge_tol=50,
row_tol=2,
column_tol=0,
**kwargs
):
self.table_regions = table_regions
self.table_areas = table_areas
self.columns = columns
@ -150,8 +160,9 @@ class Stream(BaseParser):
else:
lower = merged[-1]
if column_tol >= 0:
if (higher[0] <= lower[1] or
np.isclose(higher[0], lower[1], atol=column_tol)):
if higher[0] <= lower[1] or np.isclose(
higher[0], lower[1], atol=column_tol
):
upper_bound = max(lower[1], higher[1])
lower_bound = min(lower[0], higher[0])
merged[-1] = (lower_bound, upper_bound)
@ -186,13 +197,14 @@ class Stream(BaseParser):
List of continuous row y-coordinate tuples.
"""
row_mids = [sum([(t.y0 + t.y1) / 2 for t in r]) / len(r)
if len(r) > 0 else 0 for r in rows_grouped]
row_mids = [
sum([(t.y0 + t.y1) / 2 for t in r]) / len(r) if len(r) > 0 else 0
for r in rows_grouped
]
rows = [(row_mids[i] + row_mids[i - 1]) / 2 for i in range(1, len(row_mids))]
rows.insert(0, text_y_max)
rows.append(text_y_min)
rows = [(rows[i], rows[i + 1])
for i in range(0, len(rows) - 1)]
rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)]
return rows
@staticmethod
@ -217,8 +229,9 @@ class Stream(BaseParser):
if text:
text = Stream._group_rows(text, row_tol=row_tol)
elements = [len(r) for r in text]
new_cols = [(t.x0, t.x1)
for r in text if len(r) == max(elements) for t in r]
new_cols = [
(t.x0, t.x1) for r in text if len(r) == max(elements) for t in r
]
cols.extend(Stream._merge_columns(sorted(new_cols)))
return cols
@ -243,15 +256,13 @@ class Stream(BaseParser):
cols = [(cols[i][0] + cols[i - 1][1]) / 2 for i in range(1, len(cols))]
cols.insert(0, text_x_min)
cols.append(text_x_max)
cols = [(cols[i], cols[i + 1])
for i in range(0, len(cols) - 1)]
cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
return cols
def _validate_columns(self):
if self.table_areas is not None and self.columns is not None:
if len(self.table_areas) != len(self.columns):
raise ValueError("Length of table_areas and columns"
" should be equal")
raise ValueError("Length of table_areas and columns" " should be equal")
def _nurminen_table_detection(self, textlines):
"""A general implementation of the table detection algorithm
@ -309,16 +320,16 @@ class Stream(BaseParser):
def _generate_columns_and_rows(self, table_idx, tk):
# select elements which lie within table_bbox
t_bbox = {}
t_bbox['horizontal'] = text_in_bbox(tk, self.horizontal_text)
t_bbox['vertical'] = text_in_bbox(tk, self.vertical_text)
t_bbox["horizontal"] = text_in_bbox(tk, self.horizontal_text)
t_bbox["vertical"] = text_in_bbox(tk, self.vertical_text)
t_bbox['horizontal'].sort(key=lambda x: (-x.y0, x.x0))
t_bbox['vertical'].sort(key=lambda x: (x.x0, -x.y0))
t_bbox["horizontal"].sort(key=lambda x: (-x.y0, x.x0))
t_bbox["vertical"].sort(key=lambda x: (x.x0, -x.y0))
self.t_bbox = t_bbox
text_x_min, text_y_min, text_x_max, text_y_max = self._text_bbox(self.t_bbox)
rows_grouped = self._group_rows(self.t_bbox['horizontal'], row_tol=self.row_tol)
rows_grouped = self._group_rows(self.t_bbox["horizontal"], row_tol=self.row_tol)
rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
elements = [len(r) for r in rows_grouped]
@ -327,7 +338,7 @@ class Stream(BaseParser):
# take (0, pdf_width) by default
# similar to else condition
# len can't be 1
cols = self.columns[table_idx].split(',')
cols = self.columns[table_idx].split(",")
cols = [float(c) for c in cols]
cols.insert(0, text_x_min)
cols.append(text_x_max)
@ -346,20 +357,29 @@ class Stream(BaseParser):
if len(elements):
ncols = max(set(elements), key=elements.count)
else:
warnings.warn("No tables found in table area {}".format(
table_idx + 1))
warnings.warn(
"No tables found in table area {}".format(table_idx + 1)
)
cols = [(t.x0, t.x1) for r in rows_grouped if len(r) == ncols for t in r]
cols = self._merge_columns(sorted(cols), column_tol=self.column_tol)
inner_text = []
for i in range(1, len(cols)):
left = cols[i - 1][1]
right = cols[i][0]
inner_text.extend([t for direction in self.t_bbox
for t in self.t_bbox[direction]
if t.x0 > left and t.x1 < right])
outer_text = [t for direction in self.t_bbox
for t in self.t_bbox[direction]
if t.x0 > cols[-1][1] or t.x1 < cols[0][0]]
inner_text.extend(
[
t
for direction in self.t_bbox
for t in self.t_bbox[direction]
if t.x0 > left and t.x1 < right
]
)
outer_text = [
t
for direction in self.t_bbox
for t in self.t_bbox[direction]
if t.x0 > cols[-1][1] or t.x1 < cols[0][0]
]
inner_text.extend(outer_text)
cols = self._add_columns(cols, inner_text, self.row_tol)
cols = self._join_columns(cols, text_x_min, text_x_max)
@@ -373,11 +393,16 @@ class Stream(BaseParser):
pos_errors = []
# TODO: have a single list in place of two directional ones?
# sorted on x-coordinate based on reading order i.e. LTR or RTL
for direction in ['vertical', 'horizontal']:
for direction in ["vertical", "horizontal"]:
for t in self.t_bbox[direction]:
indices, error = get_table_index(
table, t, direction, split_text=self.split_text,
flag_size=self.flag_size, strip_text=self.strip_text)
table,
t,
direction,
split_text=self.split_text,
flag_size=self.flag_size,
strip_text=self.strip_text,
)
if indices[:2] != (-1, -1):
pos_errors.append(error)
for r_idx, c_idx, text in indices:
@@ -389,11 +414,11 @@ class Stream(BaseParser):
table.shape = table.df.shape
whitespace = compute_whitespace(data)
table.flavor = 'stream'
table.flavor = "stream"
table.accuracy = accuracy
table.whitespace = whitespace
table.order = table_idx + 1
table.page = int(os.path.basename(self.rootname).replace('page-', ''))
table.page = int(os.path.basename(self.rootname).replace("page-", ""))
# for plotting
_text = []
@@ -409,23 +434,27 @@ class Stream(BaseParser):
def extract_tables(self, filename, suppress_stdout=False, layout_kwargs={}):
self._generate_layout(filename, layout_kwargs)
if not suppress_stdout:
logger.info('Processing {}'.format(os.path.basename(self.rootname)))
logger.info("Processing {}".format(os.path.basename(self.rootname)))
if not self.horizontal_text:
if self.images:
warnings.warn('{} is image-based, camelot only works on'
' text-based pages.'.format(os.path.basename(self.rootname)))
warnings.warn(
"{} is image-based, camelot only works on"
" text-based pages.".format(os.path.basename(self.rootname))
)
else:
warnings.warn('No tables found on {}'.format(
os.path.basename(self.rootname)))
warnings.warn(
"No tables found on {}".format(os.path.basename(self.rootname))
)
return []
self._generate_table_bbox()
_tables = []
# sort tables based on y-coord
for table_idx, tk in enumerate(sorted(
self.table_bbox.keys(), key=lambda x: x[1], reverse=True)):
for table_idx, tk in enumerate(
sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True)
):
cols, rows = self._generate_columns_and_rows(table_idx, tk)
table = self._generate_table(table_idx, cols, rows)
table._bbox = tk
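The hunk above ends with extract_tables iterating detected table regions top to bottom; a minimal usage sketch of the public entry point that drives this parser (the file name and page value are placeholders):

import camelot

tables = camelot.read_pdf("foo.pdf", flavor="stream", pages="1")
print(tables[0].parsing_report)  # accuracy, whitespace, order and page, as set above
df = tables[0].df                # extracted table as a pandas DataFrame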

View File

@@ -10,7 +10,7 @@ else:
class PlotMethods(object):
def __call__(self, table, kind='text', filename=None):
def __call__(self, table, kind="text", filename=None):
"""Plot elements found on PDF page based on kind
specified, useful for debugging and playing with different
parameters to get the best output.
@@ -31,14 +31,16 @@ class PlotMethods(object):
"""
if not _HAS_MPL:
raise ImportError('matplotlib is required for plotting.')
raise ImportError("matplotlib is required for plotting.")
if table.flavor == 'lattice' and kind in ['textedge']:
raise NotImplementedError("Lattice flavor does not support kind='{}'".format(
kind))
elif table.flavor == 'stream' and kind in ['joint', 'line']:
raise NotImplementedError("Stream flavor does not support kind='{}'".format(
kind))
if table.flavor == "lattice" and kind in ["textedge"]:
raise NotImplementedError(
"Lattice flavor does not support kind='{}'".format(kind)
)
elif table.flavor == "stream" and kind in ["joint", "line"]:
raise NotImplementedError(
"Stream flavor does not support kind='{}'".format(kind)
)
plot_method = getattr(self, kind)
return plot_method(table)
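__call__ dispatches on kind and rejects kinds a flavor does not support; a minimal usage sketch (the file name is a placeholder, kind="text" is the default shown above):

import camelot
import matplotlib.pyplot as plt

tables = camelot.read_pdf("foo.pdf", flavor="stream")
fig = camelot.plot(tables[0], kind="text")  # 'joint' or 'line' here would raise NotImplementedError
plt.show()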
@@ -57,18 +59,12 @@ class PlotMethods(object):
"""
fig = plt.figure()
ax = fig.add_subplot(111, aspect='equal')
ax = fig.add_subplot(111, aspect="equal")
xs, ys = [], []
for t in table._text:
xs.extend([t[0], t[2]])
ys.extend([t[1], t[3]])
ax.add_patch(
patches.Rectangle(
(t[0], t[1]),
t[2] - t[0],
t[3] - t[1]
)
)
ax.add_patch(patches.Rectangle((t[0], t[1]), t[2] - t[0], t[3] - t[1]))
ax.set_xlim(min(xs) - 10, max(xs) + 10)
ax.set_ylim(min(ys) - 10, max(ys) + 10)
return fig
@@ -87,21 +83,17 @@ class PlotMethods(object):
"""
fig = plt.figure()
ax = fig.add_subplot(111, aspect='equal')
ax = fig.add_subplot(111, aspect="equal")
for row in table.cells:
for cell in row:
if cell.left:
ax.plot([cell.lb[0], cell.lt[0]],
[cell.lb[1], cell.lt[1]])
ax.plot([cell.lb[0], cell.lt[0]], [cell.lb[1], cell.lt[1]])
if cell.right:
ax.plot([cell.rb[0], cell.rt[0]],
[cell.rb[1], cell.rt[1]])
ax.plot([cell.rb[0], cell.rt[0]], [cell.rb[1], cell.rt[1]])
if cell.top:
ax.plot([cell.lt[0], cell.rt[0]],
[cell.lt[1], cell.rt[1]])
ax.plot([cell.lt[0], cell.rt[0]], [cell.lt[1], cell.rt[1]])
if cell.bottom:
ax.plot([cell.lb[0], cell.rb[0]],
[cell.lb[1], cell.rb[1]])
ax.plot([cell.lb[0], cell.rb[0]], [cell.lb[1], cell.rb[1]])
return fig
def contour(self, table):
@@ -124,7 +116,7 @@ class PlotMethods(object):
img, table_bbox = (None, {table._bbox: None})
_FOR_LATTICE = False
fig = plt.figure()
ax = fig.add_subplot(111, aspect='equal')
ax = fig.add_subplot(111, aspect="equal")
xs, ys = [], []
if not _FOR_LATTICE:
@@ -133,21 +125,14 @@ class PlotMethods(object):
ys.extend([t[1], t[3]])
ax.add_patch(
patches.Rectangle(
(t[0], t[1]),
t[2] - t[0],
t[3] - t[1],
color='blue'
(t[0], t[1]), t[2] - t[0], t[3] - t[1], color="blue"
)
)
for t in table_bbox.keys():
ax.add_patch(
patches.Rectangle(
(t[0], t[1]),
t[2] - t[0],
t[3] - t[1],
fill=False,
color='red'
(t[0], t[1]), t[2] - t[0], t[3] - t[1], fill=False, color="red"
)
)
if not _FOR_LATTICE:
@@ -173,25 +158,19 @@ class PlotMethods(object):
"""
fig = plt.figure()
ax = fig.add_subplot(111, aspect='equal')
ax = fig.add_subplot(111, aspect="equal")
xs, ys = [], []
for t in table._text:
xs.extend([t[0], t[2]])
ys.extend([t[1], t[3]])
ax.add_patch(
patches.Rectangle(
(t[0], t[1]),
t[2] - t[0],
t[3] - t[1],
color='blue'
)
patches.Rectangle((t[0], t[1]), t[2] - t[0], t[3] - t[1], color="blue")
)
ax.set_xlim(min(xs) - 10, max(xs) + 10)
ax.set_ylim(min(ys) - 10, max(ys) + 10)
for te in table._textedges:
ax.plot([te.x, te.x],
[te.y0, te.y1])
ax.plot([te.x, te.x], [te.y0, te.y1])
return fig
@@ -210,14 +189,14 @@ class PlotMethods(object):
"""
img, table_bbox = table._image
fig = plt.figure()
ax = fig.add_subplot(111, aspect='equal')
ax = fig.add_subplot(111, aspect="equal")
x_coord = []
y_coord = []
for k in table_bbox.keys():
for coord in table_bbox[k]:
x_coord.append(coord[0])
y_coord.append(coord[1])
ax.plot(x_coord, y_coord, 'ro')
ax.plot(x_coord, y_coord, "ro")
ax.imshow(img)
return fig
@@ -235,7 +214,7 @@ class PlotMethods(object):
"""
fig = plt.figure()
ax = fig.add_subplot(111, aspect='equal')
ax = fig.add_subplot(111, aspect="equal")
vertical, horizontal = table._segments
for v in vertical:
ax.plot([v[0], v[2]], [v[1], v[3]])

View File

@@ -19,8 +19,14 @@ from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import (LAParams, LTAnno, LTChar, LTTextLineHorizontal,
LTTextLineVertical, LTImage)
from pdfminer.layout import (
LAParams,
LTAnno,
LTChar,
LTTextLineHorizontal,
LTTextLineVertical,
LTImage,
)
PY3 = sys.version_info[0] >= 3
@@ -35,7 +41,7 @@ else:
_VALID_URLS = set(uses_relative + uses_netloc + uses_params)
_VALID_URLS.discard('')
_VALID_URLS.discard("")
# https://github.com/pandas-dev/pandas/blob/master/pandas/io/common.py
@@ -59,9 +65,11 @@ def is_url(url):
def random_string(length):
ret = ''
ret = ""
while length:
ret += random.choice(string.digits + string.ascii_lowercase + string.ascii_uppercase)
ret += random.choice(
string.digits + string.ascii_lowercase + string.ascii_uppercase
)
length -= 1
return ret
@@ -79,14 +87,14 @@ def download_url(url):
Temporary filepath.
"""
filename = '{}.pdf'.format(random_string(6))
with tempfile.NamedTemporaryFile('wb', delete=False) as f:
filename = "{}.pdf".format(random_string(6))
with tempfile.NamedTemporaryFile("wb", delete=False) as f:
obj = urlopen(url)
if PY3:
content_type = obj.info().get_content_type()
else:
content_type = obj.info().getheader('Content-Type')
if content_type != 'application/pdf':
content_type = obj.info().getheader("Content-Type")
if content_type != "application/pdf":
raise NotImplementedError("File format not supported")
f.write(obj.read())
filepath = os.path.join(os.path.dirname(f.name), filename)
@@ -94,39 +102,38 @@ def download_url(url):
return filepath
stream_kwargs = [
'columns',
'row_tol',
'column_tol'
]
stream_kwargs = ["columns", "row_tol", "column_tol"]
lattice_kwargs = [
'process_background',
'line_scale',
'copy_text',
'shift_text',
'line_tol',
'joint_tol',
'threshold_blocksize',
'threshold_constant',
'iterations'
"process_background",
"line_scale",
"copy_text",
"shift_text",
"line_tol",
"joint_tol",
"threshold_blocksize",
"threshold_constant",
"iterations",
]
def validate_input(kwargs, flavor='lattice'):
def validate_input(kwargs, flavor="lattice"):
def check_intersection(parser_kwargs, input_kwargs):
isec = set(parser_kwargs).intersection(set(input_kwargs.keys()))
if isec:
raise ValueError("{} cannot be used with flavor='{}'".format(
",".join(sorted(isec)), flavor))
raise ValueError(
"{} cannot be used with flavor='{}'".format(
",".join(sorted(isec)), flavor
)
)
if flavor == 'lattice':
if flavor == "lattice":
check_intersection(stream_kwargs, kwargs)
else:
check_intersection(lattice_kwargs, kwargs)
def remove_extra(kwargs, flavor='lattice'):
if flavor == 'lattice':
def remove_extra(kwargs, flavor="lattice"):
if flavor == "lattice":
for key in kwargs.keys():
if key in stream_kwargs:
kwargs.pop(key)
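validate_input rejects keyword arguments that belong to the other flavor; a minimal sketch of both outcomes (assuming the helper remains importable from camelot.utils, as in this file):

from camelot.utils import validate_input

validate_input({"line_scale": 40}, flavor="lattice")   # lattice kwarg: passes silently
try:
    validate_input({"row_tol": 10}, flavor="lattice")  # stream-only kwarg
except ValueError as err:
    print(err)  # row_tol cannot be used with flavor='lattice'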
@@ -256,15 +263,19 @@ def scale_image(tables, v_segments, h_segments, factors):
v_segments_new = []
for v in v_segments:
x1, x2 = scale(v[0], scaling_factor_x), scale(v[2], scaling_factor_x)
y1, y2 = scale(abs(translate(-img_y, v[1])), scaling_factor_y), scale(
abs(translate(-img_y, v[3])), scaling_factor_y)
y1, y2 = (
scale(abs(translate(-img_y, v[1])), scaling_factor_y),
scale(abs(translate(-img_y, v[3])), scaling_factor_y),
)
v_segments_new.append((x1, y1, x2, y2))
h_segments_new = []
for h in h_segments:
x1, x2 = scale(h[0], scaling_factor_x), scale(h[2], scaling_factor_x)
y1, y2 = scale(abs(translate(-img_y, h[1])), scaling_factor_y), scale(
abs(translate(-img_y, h[3])), scaling_factor_y)
y1, y2 = (
scale(abs(translate(-img_y, h[1])), scaling_factor_y),
scale(abs(translate(-img_y, h[3])), scaling_factor_y),
)
h_segments_new.append((x1, y1, x2, y2))
return tables_new, v_segments_new, h_segments_new
@@ -291,13 +302,13 @@ def get_rotation(chars, horizontal_text, vertical_text):
rotated 90 degree clockwise.
"""
rotation = ''
rotation = ""
hlen = len([t for t in horizontal_text if t.get_text().strip()])
vlen = len([t for t in vertical_text if t.get_text().strip()])
if hlen < vlen:
clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in chars)
anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in chars)
rotation = 'anticlockwise' if clockwise < anticlockwise else 'clockwise'
rotation = "anticlockwise" if clockwise < anticlockwise else "clockwise"
return rotation
@@ -325,10 +336,16 @@ def segments_in_bbox(bbox, v_segments, h_segments):
"""
lb = (bbox[0], bbox[1])
rt = (bbox[2], bbox[3])
v_s = [v for v in v_segments if v[1] > lb[1] - 2 and
v[3] < rt[1] + 2 and lb[0] - 2 <= v[0] <= rt[0] + 2]
h_s = [h for h in h_segments if h[0] > lb[0] - 2 and
h[2] < rt[0] + 2 and lb[1] - 2 <= h[1] <= rt[1] + 2]
v_s = [
v
for v in v_segments
if v[1] > lb[1] - 2 and v[3] < rt[1] + 2 and lb[0] - 2 <= v[0] <= rt[0] + 2
]
h_s = [
h
for h in h_segments
if h[0] > lb[0] - 2 and h[2] < rt[0] + 2 and lb[1] - 2 <= h[1] <= rt[1] + 2
]
return v_s, h_s
@@ -351,9 +368,12 @@ def text_in_bbox(bbox, text):
"""
lb = (bbox[0], bbox[1])
rt = (bbox[2], bbox[3])
t_bbox = [t for t in text if lb[0] - 2 <= (t.x0 + t.x1) / 2.0
<= rt[0] + 2 and lb[1] - 2 <= (t.y0 + t.y1) / 2.0
<= rt[1] + 2]
t_bbox = [
t
for t in text
if lb[0] - 2 <= (t.x0 + t.x1) / 2.0 <= rt[0] + 2
and lb[1] - 2 <= (t.y0 + t.y1) / 2.0 <= rt[1] + 2
]
return t_bbox
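text_in_bbox keeps a text object when its centre lies inside the bbox padded by 2 points on every side; a minimal standalone restatement of that test (the stand-in Text tuple and coordinates are illustrative):

from collections import namedtuple

Text = namedtuple("Text", "x0 y0 x1 y1")  # stand-in for a pdfminer text line
bbox = (100, 100, 300, 200)               # hypothetical table area: (x0, y0, x1, y1)

def keep(t):
    cx, cy = (t.x0 + t.x1) / 2.0, (t.y0 + t.y1) / 2.0
    return bbox[0] - 2 <= cx <= bbox[2] + 2 and bbox[1] - 2 <= cy <= bbox[3] + 2

print(keep(Text(120, 150, 180, 160)))  # True: centre (150, 155) is inside
print(keep(Text(10, 10, 20, 20)))      # False: centre (15, 15) is outside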
@@ -390,7 +410,7 @@ def merge_close_lines(ar, line_tol=2):
# (inspired from sklearn.pipeline.Pipeline)
def flag_font_size(textline, direction, strip_text=''):
def flag_font_size(textline, direction, strip_text=""):
"""Flags super/subscripts in text by enclosing them with <s></s>.
May give false positives.
@@ -409,10 +429,18 @@ def flag_font_size(textline, direction, strip_text=''):
fstring : string
"""
if direction == 'horizontal':
d = [(t.get_text(), np.round(t.height, decimals=6)) for t in textline if not isinstance(t, LTAnno)]
elif direction == 'vertical':
d = [(t.get_text(), np.round(t.width, decimals=6)) for t in textline if not isinstance(t, LTAnno)]
if direction == "horizontal":
d = [
(t.get_text(), np.round(t.height, decimals=6))
for t in textline
if not isinstance(t, LTAnno)
]
elif direction == "vertical":
d = [
(t.get_text(), np.round(t.width, decimals=6))
for t in textline
if not isinstance(t, LTAnno)
]
l = [np.round(size, decimals=6) for text, size in d]
if len(set(l)) > 1:
flist = []
@@ -420,21 +448,21 @@ def flag_font_size(textline, direction, strip_text=''):
for key, chars in groupby(d, itemgetter(1)):
if key == min_size:
fchars = [t[0] for t in chars]
if ''.join(fchars).strip():
fchars.insert(0, '<s>')
fchars.append('</s>')
flist.append(''.join(fchars))
if "".join(fchars).strip():
fchars.insert(0, "<s>")
fchars.append("</s>")
flist.append("".join(fchars))
else:
fchars = [t[0] for t in chars]
if ''.join(fchars).strip():
flist.append(''.join(fchars))
fstring = ''.join(flist).strip(strip_text)
if "".join(fchars).strip():
flist.append("".join(fchars))
fstring = "".join(flist).strip(strip_text)
else:
fstring = ''.join([t.get_text() for t in textline]).strip(strip_text)
fstring = "".join([t.get_text() for t in textline]).strip(strip_text)
return fstring
def split_textline(table, textline, direction, flag_size=False, strip_text=''):
def split_textline(table, textline, direction, flag_size=False, strip_text=""):
"""Splits PDFMiner LTTextLine into substrings if it spans across
multiple rows/columns.
@@ -464,19 +492,31 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=''):
cut_text = []
bbox = textline.bbox
try:
if direction == 'horizontal' and not textline.is_empty():
x_overlap = [i for i, x in enumerate(table.cols) if x[0] <= bbox[2] and bbox[0] <= x[1]]
r_idx = [j for j, r in enumerate(table.rows) if r[1] <= (bbox[1] + bbox[3]) / 2 <= r[0]]
if direction == "horizontal" and not textline.is_empty():
x_overlap = [
i
for i, x in enumerate(table.cols)
if x[0] <= bbox[2] and bbox[0] <= x[1]
]
r_idx = [
j
for j, r in enumerate(table.rows)
if r[1] <= (bbox[1] + bbox[3]) / 2 <= r[0]
]
r = r_idx[0]
x_cuts = [(c, table.cells[r][c].x2) for c in x_overlap if table.cells[r][c].right]
x_cuts = [
(c, table.cells[r][c].x2) for c in x_overlap if table.cells[r][c].right
]
if not x_cuts:
x_cuts = [(x_overlap[0], table.cells[r][-1].x2)]
for obj in textline._objs:
row = table.rows[r]
for cut in x_cuts:
if isinstance(obj, LTChar):
if (row[1] <= (obj.y0 + obj.y1) / 2 <= row[0] and
(obj.x0 + obj.x1) / 2 <= cut[1]):
if (
row[1] <= (obj.y0 + obj.y1) / 2 <= row[0]
and (obj.x0 + obj.x1) / 2 <= cut[1]
):
cut_text.append((r, cut[0], obj))
break
else:
@@ -485,19 +525,31 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=''):
cut_text.append((r, cut[0] + 1, obj))
elif isinstance(obj, LTAnno):
cut_text.append((r, cut[0], obj))
elif direction == 'vertical' and not textline.is_empty():
y_overlap = [j for j, y in enumerate(table.rows) if y[1] <= bbox[3] and bbox[1] <= y[0]]
c_idx = [i for i, c in enumerate(table.cols) if c[0] <= (bbox[0] + bbox[2]) / 2 <= c[1]]
elif direction == "vertical" and not textline.is_empty():
y_overlap = [
j
for j, y in enumerate(table.rows)
if y[1] <= bbox[3] and bbox[1] <= y[0]
]
c_idx = [
i
for i, c in enumerate(table.cols)
if c[0] <= (bbox[0] + bbox[2]) / 2 <= c[1]
]
c = c_idx[0]
y_cuts = [(r, table.cells[r][c].y1) for r in y_overlap if table.cells[r][c].bottom]
y_cuts = [
(r, table.cells[r][c].y1) for r in y_overlap if table.cells[r][c].bottom
]
if not y_cuts:
y_cuts = [(y_overlap[0], table.cells[-1][c].y1)]
for obj in textline._objs:
col = table.cols[c]
for cut in y_cuts:
if isinstance(obj, LTChar):
if (col[0] <= (obj.x0 + obj.x1) / 2 <= col[1] and
(obj.y0 + obj.y1) / 2 >= cut[1]):
if (
col[0] <= (obj.x0 + obj.x1) / 2 <= col[1]
and (obj.y0 + obj.y1) / 2 >= cut[1]
):
cut_text.append((cut[0], c, obj))
break
else:
@@ -511,15 +563,24 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=''):
grouped_chars = []
for key, chars in groupby(cut_text, itemgetter(0, 1)):
if flag_size:
grouped_chars.append((key[0], key[1],
flag_font_size([t[2] for t in chars], direction, strip_text=strip_text)))
grouped_chars.append(
(
key[0],
key[1],
flag_font_size(
[t[2] for t in chars], direction, strip_text=strip_text
),
)
)
else:
gchars = [t[2].get_text() for t in chars]
grouped_chars.append((key[0], key[1], ''.join(gchars).strip(strip_text)))
grouped_chars.append((key[0], key[1], "".join(gchars).strip(strip_text)))
return grouped_chars
def get_table_index(table, t, direction, split_text=False, flag_size=False, strip_text='',):
def get_table_index(
table, t, direction, split_text=False, flag_size=False, strip_text=""
):
"""Gets indices of the table cell where given text object lies by
comparing their y and x-coordinates.
@@ -558,8 +619,9 @@ def get_table_index(table, t, direction, split_text=False, flag_size=False, stri
"""
r_idx, c_idx = [-1] * 2
for r in range(len(table.rows)):
if ((t.y0 + t.y1) / 2.0 < table.rows[r][0] and
(t.y0 + t.y1) / 2.0 > table.rows[r][1]):
if (t.y0 + t.y1) / 2.0 < table.rows[r][0] and (t.y0 + t.y1) / 2.0 > table.rows[
r
][1]:
lt_col_overlap = []
for c in table.cols:
if c[0] <= t.x1 and c[1] >= t.x0:
@@ -569,11 +631,14 @@ def get_table_index(table, t, direction, split_text=False, flag_size=False, stri
else:
lt_col_overlap.append(-1)
if len(list(filter(lambda x: x != -1, lt_col_overlap))) == 0:
text = t.get_text().strip('\n')
text = t.get_text().strip("\n")
text_range = (t.x0, t.x1)
col_range = (table.cols[0][0], table.cols[-1][1])
warnings.warn("{} {} does not lie in column range {}".format(
text, text_range, col_range))
warnings.warn(
"{} {} does not lie in column range {}".format(
text, text_range, col_range
)
)
r_idx = r
c_idx = lt_col_overlap.index(max(lt_col_overlap))
break
@@ -594,10 +659,24 @@ def get_table_index(table, t, direction, split_text=False, flag_size=False, stri
error = ((X * (y0_offset + y1_offset)) + (Y * (x0_offset + x1_offset))) / charea
if split_text:
return split_textline(table, t, direction, flag_size=flag_size, strip_text=strip_text), error
return (
split_textline(
table, t, direction, flag_size=flag_size, strip_text=strip_text
),
error,
)
else:
if flag_size:
return [(r_idx, c_idx, flag_font_size(t._objs, direction, strip_text=strip_text))], error
return (
[
(
r_idx,
c_idx,
flag_font_size(t._objs, direction, strip_text=strip_text),
)
],
error,
)
else:
return [(r_idx, c_idx, t.get_text().strip(strip_text))], error
@@ -650,14 +729,20 @@ def compute_whitespace(d):
r_nempty_cells, c_nempty_cells = [], []
for i in d:
for j in i:
if j.strip() == '':
if j.strip() == "":
whitespace += 1
whitespace = 100 * (whitespace / float(len(d) * len(d[0])))
return whitespace
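compute_whitespace reports the percentage of empty cells in the extracted data; a quick worked example (the data grid is made up):

from camelot.utils import compute_whitespace

data = [["a", "", "c"], ["", "e", "f"]]  # 2 empty cells out of 6
print(compute_whitespace(data))          # 33.33... percent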
def get_page_layout(filename, char_margin=1.0, line_margin=0.5, word_margin=0.1,
detect_vertical=True, all_texts=True):
def get_page_layout(
filename,
char_margin=1.0,
line_margin=0.5,
word_margin=0.1,
detect_vertical=True,
all_texts=True,
):
"""Returns a PDFMiner LTPage object and page dimension of a single
page pdf. See https://euske.github.io/pdfminer/ to get definitions
of kwargs.
@@ -680,16 +765,18 @@ def get_page_layout(filename, char_margin=1.0, line_margin=0.5, word_margin=0.1,
Dimension of pdf page in the form (width, height).
"""
with open(filename, 'rb') as f:
with open(filename, "rb") as f:
parser = PDFParser(f)
document = PDFDocument(parser)
if not document.is_extractable:
raise PDFTextExtractionNotAllowed
laparams = LAParams(char_margin=char_margin,
line_margin=line_margin,
word_margin=word_margin,
detect_vertical=detect_vertical,
all_texts=all_texts)
laparams = LAParams(
char_margin=char_margin,
line_margin=line_margin,
word_margin=word_margin,
detect_vertical=detect_vertical,
all_texts=all_texts,
)
rsrcmgr = PDFResourceManager()
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
@@ -721,13 +808,13 @@ def get_text_objects(layout, ltype="char", t=None):
List of PDFMiner text objects.
"""
if ltype == 'char':
if ltype == "char":
LTObject = LTChar
elif ltype == 'image':
elif ltype == "image":
LTObject = LTImage
elif ltype == 'horizontal_text':
elif ltype == "horizontal_text":
LTObject = LTTextLineHorizontal
elif ltype == 'vertical_text':
elif ltype == "vertical_text":
LTObject = LTTextLineVertical
if t is None:
t = []
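get_text_objects collects layout objects of the requested pdfminer type; a minimal usage sketch pairing it with get_page_layout above (the file name is a placeholder and the page is assumed to be text-based):

from camelot.utils import get_page_layout, get_text_objects

layout, dim = get_page_layout("foo.pdf")  # LTPage and (width, height)
chars = get_text_objects(layout, ltype="char")
horizontal_text = get_text_objects(layout, ltype="horizontal_text")
print(dim, len(chars), len(horizontal_text))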