Merge 42f8321c8c into 4b08165328
camelot/cli.py
@@ -18,7 +18,7 @@ logger = logging.getLogger("camelot")
 logger.setLevel(logging.INFO)


-class Config(object):
+class Config():
     def __init__(self):
         self.config = {}

@@ -31,7 +31,8 @@ pass_config = click.make_pass_decorator(Config)

 @click.group(name="camelot")
 @click.version_option(version=__version__)
-@click.option("-q", "--quiet", is_flag=False, help="Suppress logs and warnings.")
+@click.option("-q", "--quiet", is_flag=False,
+              help="Suppress logs and warnings.")
 @click.option(
     "-p",
     "--pages",
@@ -57,7 +58,7 @@ pass_config = click.make_pass_decorator(Config)
     "-flag",
     "--flag_size",
     is_flag=True,
-    help="Flag text based on" " font size. Useful to detect super/subscripts.",
+    help="Flag text based on font size. Useful to detect super/subscripts.",
 )
 @click.option(
     "-strip",
@@ -98,7 +99,8 @@ def cli(ctx, *args, **kwargs):
     " where x1, y1 -> left-top and x2, y2 -> right-bottom.",
 )
 @click.option(
-    "-back", "--process_background", is_flag=True, help="Process background lines."
+    "-back", "--process_background", is_flag=True,
+    help="Process background lines."
 )
 @click.option(
     "-scale",
@@ -127,7 +129,8 @@ def cli(ctx, *args, **kwargs):
     "-l",
     "--line_tol",
     default=2,
-    help="Tolerance parameter used to merge close vertical" " and horizontal lines.",
+    help="Tolerance parameter used to merge close vertical"
+    " and horizontal lines.",
 )
 @click.option(
     "-j",
@@ -197,12 +200,15 @@ def lattice(c, *args, **kwargs):
            raise ImportError("matplotlib is required for plotting.")
     else:
         if output is None:
-            raise click.UsageError("Please specify output file path using --output")
+            raise click.UsageError(
+                "Please specify output file path using --output")
         if f is None:
-            raise click.UsageError("Please specify output file format using --format")
+            raise click.UsageError(
+                "Please specify output file format using --format")

     tables = read_pdf(
-        filepath, pages=pages, flavor="lattice", suppress_stdout=quiet, **kwargs
+        filepath, pages=pages, flavor="lattice", suppress_stdout=quiet,
+        **kwargs
     )
     click.echo(f"Found {tables.n} tables")
     if plot_type is not None:
@@ -247,7 +253,8 @@ def lattice(c, *args, **kwargs):
     "-r",
     "--row_tol",
     default=2,
-    help="Tolerance parameter" " used to combine text vertically, to generate rows.",
+    help="Tolerance parameter"
+    " used to combine text vertically, to generate rows.",
 )
 @click.option(
     "-c",
@@ -288,9 +295,11 @@ def stream(c, *args, **kwargs):
            raise ImportError("matplotlib is required for plotting.")
     else:
         if output is None:
-            raise click.UsageError("Please specify output file path using --output")
+            raise click.UsageError(
+                "Please specify output file path using --output")
         if f is None:
-            raise click.UsageError("Please specify output file format using --format")
+            raise click.UsageError(
+                "Please specify output file format using --format")

     tables = read_pdf(
         filepath, pages=pages, flavor="stream", suppress_stdout=quiet, **kwargs
@@ -302,3 +311,98 @@ def stream(c, *args, **kwargs):
            plt.show()
     else:
         tables.export(output, f=f, compress=compress)
+
+
+@cli.command("network")
+@click.option(
+    "-R",
+    "--table_regions",
+    default=[],
+    multiple=True,
+    help="Page regions to analyze. Example: x1,y1,x2,y2"
+    " where x1, y1 -> left-top and x2, y2 -> right-bottom.",
+)
+@click.option(
+    "-T",
+    "--table_areas",
+    default=[],
+    multiple=True,
+    help="Table areas to process. Example: x1,y1,x2,y2"
+    " where x1, y1 -> left-top and x2, y2 -> right-bottom.",
+)
+@click.option(
+    "-C",
+    "--columns",
+    default=[],
+    multiple=True,
+    help="X coordinates of column separators.",
+)
+@click.option(
+    "-e",
+    "--edge_tol",
+    default=50,
+    help="Tolerance parameter" " for extending textedges vertically.",
+)
+@click.option(
+    "-r",
+    "--row_tol",
+    default=2,
+    help="Tolerance parameter"
+    " used to combine text vertically, to generate rows.",
+)
+@click.option(
+    "-c",
+    "--column_tol",
+    default=0,
+    help="Tolerance parameter"
+    " used to combine text horizontally, to generate columns.",
+)
+@click.option(
+    "-plot",
+    "--plot_type",
+    type=click.Choice(["text", "grid", "contour", "textedge"]),
+    help="Plot elements found on PDF page for visual debugging.",
+)
+@click.argument("filepath", type=click.Path(exists=True))
+@pass_config
+def network(c, *args, **kwargs):
+    """Use spaces between text to parse the table."""
+    conf = c.config
+    pages = conf.pop("pages")
+    output = conf.pop("output")
+    f = conf.pop("format")
+    compress = conf.pop("zip")
+    quiet = conf.pop("quiet")
+    plot_type = kwargs.pop("plot_type")
+    filepath = kwargs.pop("filepath")
+    kwargs.update(conf)
+
+    table_regions = list(kwargs["table_regions"])
+    kwargs["table_regions"] = None if not table_regions else table_regions
+    table_areas = list(kwargs["table_areas"])
+    kwargs["table_areas"] = None if not table_areas else table_areas
+    columns = list(kwargs["columns"])
+    kwargs["columns"] = None if not columns else columns
+
+    if plot_type is not None:
+        if not _HAS_MPL:
+            raise ImportError("matplotlib is required for plotting.")
+    else:
+        if output is None:
+            raise click.UsageError(
+                "Please specify output file path using --output")
+        if f is None:
+            raise click.UsageError(
+                "Please specify output file format using --format")
+
+    tables = read_pdf(
+        filepath, pages=pages, flavor="network",
+        suppress_stdout=quiet, **kwargs
+    )
+    click.echo(f"Found {tables.n} tables")
+    if plot_type is not None:
+        for table in tables:
+            plot(table, kind=plot_type)
+            plt.show()
+    else:
+        tables.export(output, f=f, compress=compress)
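
The new subcommand routes through the same read_pdf pipeline as lattice and stream; a minimal sketch of the equivalent library call (the PDF path is hypothetical):

    import camelot

    # flavor="network" is dispatched exactly like the CLI command above.
    tables = camelot.read_pdf("foo.pdf", pages="1", flavor="network")
    print(f"Found {tables.n} tables")
    tables.export("foo.csv", f="csv", compress=False)
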
camelot/core.py
@@ -4,12 +4,20 @@ import os
 import sqlite3
 import zipfile
 import tempfile
-from itertools import chain
 from operator import itemgetter

 import numpy as np
 import pandas as pd

+from cv2 import cv2
+
+from .utils import (
+    get_index_closest_point,
+    get_textline_coords,
+    build_file_path_in_temp_dir,
+    export_pdf_as_png
+)
+

 # minimum number of vertical textline intersections for a textedge
 # to be considered valid
@@ -18,14 +26,70 @@ TEXTEDGE_REQUIRED_ELEMENTS = 4
 TABLE_AREA_PADDING = 10


-class TextEdge(object):
-    """Defines a text edge coordinates relative to a left-bottom
-    origin. (PDF coordinate space)
+HORIZONTAL_ALIGNMENTS = ["left", "right", "middle"]
+VERTICAL_ALIGNMENTS = ["top", "bottom", "center"]
+ALL_ALIGNMENTS = HORIZONTAL_ALIGNMENTS + VERTICAL_ALIGNMENTS
+
+
+class TextAlignment():
+    """Represents a list of textlines sharing an alignment on a coordinate.
+
+    The alignment can be left/right/middle or top/bottom/center.
+
+    (PDF coordinate space)

     Parameters
     ----------
-    x : float
-        x-coordinate of the text edge.
+    coord : float
+        coordinate of the initial text edge. Depending on the alignment
+        it could be a vertical or horizontal coordinate.
+    textline : obj
+        the original textline to start the alignment
+    align : str
+        Name of the alignment (e.g. "left", "top", etc)
+
+    Attributes
+    ----------
+    coord : float
+        The coordinate aligned averaged out across textlines. It can be along
+        the x or y axis.
+    textlines : array
+        Array of textlines that demonstrate this alignment.
+    align : str
+        Name of the alignment (e.g. "left", "top", etc)
+
+    """
+
+    def __init__(self, coord, textline, align):
+        self.coord = coord
+        self.textlines = [textline]
+        self.align = align
+
+    def __repr__(self):
+        text_inside = " | ".join(
+            map(lambda x: x.get_text(), self.textlines[:2])).replace("\n", "")
+        return f"<TextEdge coord={self.coord} tl={len(self.textlines)} " \
+            f"textlines text='{text_inside}...'>"
+
+    def register_aligned_textline(self, textline, coord):
+        """Update new textline to this alignment, adapting its average."""
+        # Increase the intersections for this segment, expand it up,
+        # and adjust the x based on the new value
+        self.coord = (self.coord * len(self.textlines) + coord) / \
+            float(len(self.textlines) + 1)
+        self.textlines.append(textline)
+
+
+class TextEdge(TextAlignment):
+    """Defines a text edge coordinates relative to a left-bottom
+    origin. (PDF coordinate space).
+
+    An edge is an alignment bounded over a segment.
+
+    Parameters
+    ----------
+    coord : float
+        coordinate of the text edge. Can be x or y.
     y0 : float
         y-coordinate of bottommost point.
     y1 : float
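
The running average kept by register_aligned_textline can be checked in isolation; a minimal sketch with stand-in textlines (the real arguments are pdfminer textline objects):

    from types import SimpleNamespace

    tl = SimpleNamespace(get_text=lambda: "row 1")
    ta = TextAlignment(coord=100.0, textline=tl, align="left")
    ta.register_aligned_textline(SimpleNamespace(get_text=lambda: "row 2"), 102.0)
    # coord is now the mean over both textlines: (100.0 + 102.0) / 2
    assert ta.coord == 101.0
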
@@ -35,101 +99,120 @@ class TextEdge(object):

     Attributes
     ----------
-    intersections: int
-        Number of intersections with horizontal text rows.
     is_valid: bool
-        A text edge is valid if it intersections with at least
+        A text edge is valid if it intersects with at least
         TEXTEDGE_REQUIRED_ELEMENTS horizontal text rows.

     """

-    def __init__(self, x, y0, y1, align="left"):
-        self.x = x
-        self.y0 = y0
-        self.y1 = y1
-        self.align = align
-        self.intersections = 0
+    def __init__(self, coord, textline, align):
+        super().__init__(coord, textline, align)
+        self.y0 = textline.y0
+        self.y1 = textline.y1
         self.is_valid = False

     def __repr__(self):
-        x = round(self.x, 2)
+        x = round(self.coord, 2)
         y0 = round(self.y0, 2)
         y1 = round(self.y1, 2)
-        return f"<TextEdge x={x} y0={y0} y1={y1} align={self.align} valid={self.is_valid}>"
+        return f"<TextEdge x={x} y0={y0} y1={y1} align={self.align} " \
+            f"valid={self.is_valid}>"

-    def update_coords(self, x, y0, edge_tol=50):
+    def update_coords(self, x, textline, edge_tol=50):
         """Updates the text edge's x and bottom y coordinates and sets
         the is_valid attribute.
         """
-        if np.isclose(self.y0, y0, atol=edge_tol):
-            self.x = (self.intersections * self.x + x) / float(self.intersections + 1)
-            self.y0 = y0
-            self.intersections += 1
+        if np.isclose(self.y0, textline.y0, atol=edge_tol):
+            self.register_aligned_textline(textline, x)
+            self.y0 = textline.y0
             # a textedge is valid only if it extends uninterrupted
             # over a required number of textlines
-            if self.intersections > TEXTEDGE_REQUIRED_ELEMENTS:
+            if len(self.textlines) > TEXTEDGE_REQUIRED_ELEMENTS:
                 self.is_valid = True


-class TextEdges(object):
+class TextAlignments():
+    """Defines a dict of text edges across reference alignments.
+    """
+
+    def __init__(self, alignment_names):
+        # For each possible alignment, list of tuples coordinate/textlines
+        self._text_alignments = {}
+        for alignment_name in alignment_names:
+            self._text_alignments[alignment_name] = []
+
+    @staticmethod
+    def _create_new_text_alignment(coord, textline, align):
+        return TextAlignment(coord, textline, align)
+
+    def _update_alignment(self, alignment, coord, textline):
+        return NotImplemented
+
+    def _register_textline(self, textline):
+        """Updates an existing text edge in the current dict.
+        """
+        coords = get_textline_coords(textline)
+        for alignment_id, alignment_array in self._text_alignments.items():
+            coord = coords[alignment_id]
+
+            # Find the index of the closest existing element (or 0 if none)
+            idx_closest = get_index_closest_point(
+                coord, alignment_array, fn=lambda x: x.coord
+            )
+
+            # Check if the edges before/after are close enough
+            # that it can be considered aligned
+            idx_insert = None
+            if idx_closest is None:
+                idx_insert = 0
+            else:
+                coord_closest = alignment_array[idx_closest].coord
+                # Note: np.isclose is slow!
+                if coord - 0.5 < coord_closest < coord + 0.5:
+                    self._update_alignment(
+                        alignment_array[idx_closest],
+                        coord,
+                        textline
+                    )
+                elif coord_closest < coord:
+                    idx_insert = idx_closest + 1
+                else:
+                    idx_insert = idx_closest
+            if idx_insert is not None:
+                new_alignment = self._create_new_text_alignment(
+                    coord, textline, alignment_id
+                )
+                alignment_array.insert(idx_insert, new_alignment)
+
+
+class TextEdges(TextAlignments):
     """Defines a dict of left, right and middle text edges found on
     the PDF page. The dict has three keys based on the alignments,
     and each key's value is a list of camelot.core.TextEdge objects.
     """

     def __init__(self, edge_tol=50):
+        super().__init__(HORIZONTAL_ALIGNMENTS)
         self.edge_tol = edge_tol
-        self._textedges = {"left": [], "right": [], "middle": []}

     @staticmethod
-    def get_x_coord(textline, align):
-        """Returns the x coordinate of a text row based on the
-        specified alignment.
-        """
-        x_left = textline.x0
-        x_right = textline.x1
-        x_middle = x_left + (x_right - x_left) / 2.0
-        x_coord = {"left": x_left, "middle": x_middle, "right": x_right}
-        return x_coord[align]
-
-    def find(self, x_coord, align):
-        """Returns the index of an existing text edge using
-        the specified x coordinate and alignment.
-        """
-        for i, te in enumerate(self._textedges[align]):
-            if np.isclose(te.x, x_coord, atol=0.5):
-                return i
-        return None
-
-    def add(self, textline, align):
-        """Adds a new text edge to the current dict.
-        """
-        x = self.get_x_coord(textline, align)
-        y0 = textline.y0
-        y1 = textline.y1
-        te = TextEdge(x, y0, y1, align=align)
-        self._textedges[align].append(te)
-
-    def update(self, textline):
-        """Updates an existing text edge in the current dict.
-        """
-        for align in ["left", "right", "middle"]:
-            x_coord = self.get_x_coord(textline, align)
-            idx = self.find(x_coord, align)
-            if idx is None:
-                self.add(textline, align)
-            else:
-                self._textedges[align][idx].update_coords(
-                    x_coord, textline.y0, edge_tol=self.edge_tol
-                )
+    def _create_new_text_alignment(coord, textline, align):
+        # In TextEdges, each alignment is a TextEdge
+        return TextEdge(coord, textline, align)
+
+    def add(self, coord, textline, align):
+        """Adds a new text edge to the current dict."""
+        te = self._create_new_text_alignment(coord, textline, align)
+        self._text_alignments[align].append(te)
+
+    def _update_alignment(self, alignment, coord, textline):
+        alignment.update_coords(coord, textline, self.edge_tol)

     def generate(self, textlines):
-        """Generates the text edges dict based on horizontal text
-        rows.
-        """
+        """Generates the text edges dict based on horizontal text rows."""
         for tl in textlines:
             if len(tl.get_text().strip()) > 1:  # TODO: hacky
-                self.update(tl)
+                self._register_textline(tl)

     def get_relevant(self):
         """Returns the list of relevant text edges (all share the same
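
_register_textline relies on each alignment_array staying sorted by coord so the closest-edge lookup can bisect instead of scanning (replacing the old np.isclose loop in find()). get_index_closest_point lives in ..utils and is not part of this diff; a sketch of the semantics the loop assumes:

    from bisect import bisect_left

    def get_index_closest_point(point, array, fn=lambda x: x):
        # Hypothetical stand-in: index of the element whose fn() value is
        # nearest to point, or None for an empty array.
        if not array:
            return None
        keys = [fn(el) for el in array]
        i = bisect_left(keys, point)
        if i == 0:
            return 0
        if i == len(keys):
            return len(keys) - 1
        # pick the nearer of the two neighbours
        return i if keys[i] - point < point - keys[i - 1] else i - 1
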
@@ -138,13 +221,16 @@ class TextEdges(object):
         """
         intersections_sum = {
             "left": sum(
-                te.intersections for te in self._textedges["left"] if te.is_valid
+                len(te.textlines) for te in self._text_alignments["left"]
+                if te.is_valid
             ),
             "right": sum(
-                te.intersections for te in self._textedges["right"] if te.is_valid
+                len(te.textlines) for te in self._text_alignments["right"]
+                if te.is_valid
             ),
             "middle": sum(
-                te.intersections for te in self._textedges["middle"] if te.is_valid
+                len(te.textlines) for te in self._text_alignments["middle"]
+                if te.is_valid
             ),
         }
@@ -152,7 +238,10 @@ class TextEdges(object):
         # get vertical textedges that intersect maximum number of
         # times with horizontal textlines
         relevant_align = max(intersections_sum.items(), key=itemgetter(1))[0]
-        return self._textedges[relevant_align]
+        return list(filter(
+            lambda te: te.is_valid,
+            self._text_alignments[relevant_align])
+        )

     def get_table_areas(self, textlines, relevant_textedges):
         """Returns a dict of interesting table areas on the PDF page
@@ -168,31 +257,30 @@ class TextEdges(object):
             return (x0, y0, x1, y1)

         # sort relevant textedges in reading order
-        relevant_textedges.sort(key=lambda te: (-te.y0, te.x))
+        relevant_textedges.sort(key=lambda te: (-te.y0, te.coord))

         table_areas = {}
         for te in relevant_textedges:
-            if te.is_valid:
-                if not table_areas:
-                    table_areas[(te.x, te.y0, te.x, te.y1)] = None
+            if not table_areas:
+                table_areas[(te.coord, te.y0, te.coord, te.y1)] = None
+            else:
+                found = None
+                for area in table_areas:
+                    # check for overlap
+                    if te.y1 >= area[1] and te.y0 <= area[3]:
+                        found = area
+                        break
+                if found is None:
+                    table_areas[(te.coord, te.y0, te.coord, te.y1)] = None
                 else:
-                    found = None
-                    for area in table_areas:
-                        # check for overlap
-                        if te.y1 >= area[1] and te.y0 <= area[3]:
-                            found = area
-                            break
-                    if found is None:
-                        table_areas[(te.x, te.y0, te.x, te.y1)] = None
-                    else:
-                        table_areas.pop(found)
-                        updated_area = (
-                            found[0],
-                            min(te.y0, found[1]),
-                            max(found[2], te.x),
-                            max(found[3], te.y1),
-                        )
-                        table_areas[updated_area] = None
+                    table_areas.pop(found)
+                    updated_area = (
+                        found[0],
+                        min(te.y0, found[1]),
+                        max(found[2], te.coord),
+                        max(found[3], te.y1),
+                    )
+                    table_areas[updated_area] = None

         # extend table areas based on textlines that overlap
         # vertically. it's possible that these textlines were
@@ -218,7 +306,8 @@ class TextEdges(object):
                     max(found[3], tl.y1),
                 )
                 table_areas[updated_area] = None
-        average_textline_height = sum_textline_height / float(len(textlines))
+        average_textline_height = sum_textline_height / \
+            float(len(textlines))

         # add some padding to table areas
         table_areas_padded = {}
@@ -228,7 +317,7 @@ class TextEdges(object):
         return table_areas_padded


-class Cell(object):
+class Cell():
     """Defines a cell in a table with coordinates relative to a
     left-bottom origin. (PDF coordinate space)
@@ -304,14 +393,13 @@ class Cell(object):

     @property
     def bound(self):
-        """The number of sides on which the cell is bounded.
-        """
+        """The number of sides on which the cell is bounded."""
         return self.top + self.bottom + self.left + self.right


-class Table(object):
-    """Defines a table with coordinates relative to a left-bottom
-    origin. (PDF coordinate space)
+class Table():
+    """Defines a table with coordinates relative to a left-bottom origin.
+    (PDF coordinate space)

     Parameters
     ----------
@@ -331,6 +419,8 @@ class Table(object):
         Accuracy with which text was assigned to the cell.
     whitespace : float
         Percentage of whitespace in the table.
+    filename : str
+        Path of the original PDF
     order : int
         Table number on PDF page.
     page : int
@@ -341,13 +431,27 @@ class Table(object):
     def __init__(self, cols, rows):
         self.cols = cols
         self.rows = rows
-        self.cells = [[Cell(c[0], r[1], c[1], r[0]) for c in cols] for r in rows]
+        self.cells = [
+            [Cell(c[0], r[1], c[1], r[0]) for c in cols] for r in rows
+        ]
         self.df = None
         self.shape = (0, 0)
         self.accuracy = 0
         self.whitespace = 0
+        self.filename = None
         self.order = None
         self.page = None
+        self.flavor = None  # Flavor of the parser used
+        self.pdf_size = None  # Dimensions of the original PDF page
+        self._bbox = None  # Bounding box in original document
+        self.parse = None  # Parse information
+        self.parse_details = None  # Field holding extra debug data
+
+        self._image = None
+        self._image_path = None  # Temporary file to hold an image of the pdf
+
+        self._text = []  # List of text box coordinates
+        self.textlines = []  # List of actual textlines on the page

     def __repr__(self):
         return f"<{self.__class__.__name__} shape={self.shape}>"
@@ -356,8 +460,7 @@ class Table(object):
         if self.page == other.page:
             if self.order < other.order:
                 return True
-        if self.page < other.page:
-            return True
+        return self.page < other.page

     @property
     def data(self):
@@ -382,6 +485,19 @@ class Table(object):
         }
         return report

+    def get_pdf_image(self):
+        """Compute pdf image and cache it
+        """
+        if self._image is None:
+            if self._image_path is None:
+                self._image_path = build_file_path_in_temp_dir(
+                    os.path.basename(self.filename),
+                    ".png"
+                )
+                export_pdf_as_png(self.filename, self._image_path)
+            self._image = cv2.imread(self._image_path)
+        return self._image
+
     def set_all_edges(self):
         """Sets all table edges to True.
         """
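
get_pdf_image renders the page once and memoizes both the temp path and the decoded array, so repeated calls are cheap:

    img_a = table.get_pdf_image()  # renders a PNG, reads it with cv2.imread
    img_b = table.get_pdf_image()  # served from self._image, no re-render
    assert img_a is img_b
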
@@ -548,7 +664,7 @@ class Table(object):
             bottom = cell.bottom
             if cell.bound == 4:
                 continue
-            elif cell.bound == 3:
+            if cell.bound == 3:
                 if not left and (right and top and bottom):
                     cell.hspan = True
                 elif not right and (left and top and bottom):
@@ -578,7 +694,8 @@ class Table(object):
             Output filepath.

         """
-        kw = {"encoding": "utf-8", "index": False, "header": False, "quoting": 1}
+        kw = {"encoding": "utf-8", "index": False, "header": False,
+              "quoting": 1}
         kw.update(kwargs)
         self.df.to_csv(path, **kw)
@@ -615,6 +732,7 @@ class Table(object):
             "encoding": "utf-8",
         }
         kw.update(kwargs)
+        # pylint: disable=abstract-class-instantiated
         writer = pd.ExcelWriter(path)
         self.df.to_excel(writer, **kw)
         writer.save()
@@ -653,8 +771,41 @@ class Table(object):
         conn.commit()
         conn.close()

+    def copy_spanning_text(self, copy_text=None):
+        """Copies over text in empty spanning cells.
+
+        Parameters
+        ----------
+        copy_text : list, optional (default: None)
+            {'h', 'v'}
+            Select one or more strings from above and pass them as a list
+            to specify the direction in which text should be copied over
+            when a cell spans multiple rows or columns.
+
+        Returns
+        -------
+        t : camelot.core.Table
+
+        """
+        for f in copy_text:
+            if f == "h":
+                for i, row in enumerate(self.cells):
+                    for j, cell in enumerate(row):
+                        if cell.text.strip() == "" and \
+                                cell.hspan and \
+                                not cell.left:
+                            cell.text = self.cells[i][j - 1].text
+            elif f == "v":
+                for i, row in enumerate(self.cells):
+                    for j, cell in enumerate(row):
+                        if cell.text.strip() == "" and \
+                                cell.vspan and \
+                                not cell.top:
+                            cell.text = self.cells[i - 1][j].text
+        return self
+

-class TableList(object):
+class TableList():
     """Defines a list of camelot.core.Table objects. Each table can
     be accessed using its index.
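
Typical use mirrors the existing copy_text kwarg on the lattice flavor; a sketch on an already-parsed table whose hspan/vspan flags have been set by the parser:

    # Fill empty cells inside horizontal spans first, then vertical ones.
    table = table.copy_spanning_text(copy_text=["h", "v"])
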
@@ -734,10 +885,15 @@ class TableList(object):
                self._compress_dir(**kwargs)
         elif f == "excel":
             filepath = os.path.join(dirname, basename)
+            # pylint: disable=abstract-class-instantiated
             writer = pd.ExcelWriter(filepath)
             for table in self._tables:
                 sheet_name = f"page-{table.page}-table-{table.order}"
-                table.df.to_excel(writer, sheet_name=sheet_name, encoding="utf-8")
+                table.df.to_excel(
+                    writer,
+                    sheet_name=sheet_name,
+                    encoding="utf-8"
+                )
             writer.save()
             if compress:
                 zipname = os.path.join(os.path.dirname(path), root) + ".zip"
camelot/handlers.py

@@ -2,13 +2,14 @@

 import os
 import sys
+import logging

 from PyPDF2 import PdfFileReader, PdfFileWriter

 from .core import TableList
-from .parsers import Stream, Lattice
+from .parsers import Stream, Lattice, Network, Hybrid
 from .utils import (
-    TemporaryDirectory,
+    build_file_path_in_temp_dir,
     get_page_layout,
     get_text_objects,
     get_rotation,
@@ -16,8 +17,17 @@ from .utils import (
     download_url,
 )

+logger = logging.getLogger("camelot")

-class PDFHandler(object):
+PARSERS = {
+    "lattice": Lattice,
+    "stream": Stream,
+    "network": Network,
+    "hybrid": Hybrid,
+}
+
+
+class PDFHandler():
     """Handles all operations like temp directory creation, splitting
     file into single page PDFs, parsing each PDF and then removing the
     temp directory.
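
The module-level dict turns the old Lattice-or-Stream conditional into a plain lookup, which is how parse() below instantiates the parser (each concrete parser class is assumed to supply its own parser_id to BaseParser):

    parser_cls = PARSERS["network"]   # KeyError on an unknown flavor
    parser = parser_cls(debug=False)  # read_pdf kwargs are forwarded here
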
@@ -31,10 +41,13 @@ class PDFHandler(object):
         Example: '1,3,4' or '1,4-end' or 'all'.
     password : str, optional (default: None)
         Password for decryption.
+    debug : bool, optional (default: False)
+        Whether the parser should store debug information during parsing.

     """

-    def __init__(self, filepath, pages="1", password=None):
+    def __init__(self, filepath, pages="1", password=None, debug=False):
+        self.debug = debug
         if is_url(filepath):
             filepath = download_url(filepath)
         self.filepath = filepath
@@ -89,38 +102,54 @@ class PDFHandler(object):
            P.extend(range(p["start"], p["end"] + 1))
         return sorted(set(P))

-    def _save_page(self, filepath, page, temp):
-        """Saves specified page from PDF into a temporary directory.
+    def _read_pdf_page(self, page=1, layout_kwargs=None):
+        """Saves specified page from PDF into a temporary directory. Removes
+        password protection and normalizes rotation.

         Parameters
         ----------
-        filepath : str
-            Filepath or URL of the PDF file.
         page : int
             Page number.
-        temp : str
-            Tmp directory.
+        layout_kwargs : dict, optional (default: {})
+            A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.  # noqa
+
+        Returns
+        -------
+        layout : object
+
+        dimensions : tuple
+            The dimensions of the pdf page
+
+        filepath : str
+            The path of the single page PDF - either the original, or a
+            normalized version.

         """
-        with open(filepath, "rb") as fileobj:
+        layout_kwargs = layout_kwargs or {}
+        with open(self.filepath, "rb") as fileobj:
+            # Normalize the pdf file, but skip if it's not encrypted or has
+            # only one page.
             infile = PdfFileReader(fileobj, strict=False)
             if infile.isEncrypted:
                 infile.decrypt(self.password)
-            fpath = os.path.join(temp, f"page-{page}.pdf")
+            fpath = build_file_path_in_temp_dir(f"page-{page}.pdf")
             froot, fext = os.path.splitext(fpath)
             p = infile.getPage(page - 1)
             outfile = PdfFileWriter()
             outfile.addPage(p)
             with open(fpath, "wb") as f:
                 outfile.write(f)
-            layout, dim = get_page_layout(fpath)
+            layout, dimensions = get_page_layout(
+                fpath, **layout_kwargs)
             # fix rotated PDF
             chars = get_text_objects(layout, ltype="char")
             horizontal_text = get_text_objects(layout, ltype="horizontal_text")
             vertical_text = get_text_objects(layout, ltype="vertical_text")
             rotation = get_rotation(chars, horizontal_text, vertical_text)
             if rotation != "":
-                fpath_new = "".join([froot.replace("page", "p"), "_rotated", fext])
+                fpath_new = "".join(
+                    [froot.replace("page", "p"), "_rotated", fext])
                 os.rename(fpath, fpath_new)
                 infile = PdfFileReader(open(fpath_new, "rb"), strict=False)
                 if infile.isEncrypted:
@@ -134,9 +163,13 @@ class PDFHandler(object):
                 outfile.addPage(p)
                 with open(fpath, "wb") as f:
                     outfile.write(f)
+                layout, dimensions = get_page_layout(
+                    fpath, **layout_kwargs)
+        return layout, dimensions, fpath

     def parse(
-        self, flavor="lattice", suppress_stdout=False, layout_kwargs={}, **kwargs
+        self, flavor="lattice", suppress_stdout=False,
+        layout_kwargs=None, **kwargs
     ):
         """Extracts tables by calling parser.get_tables on all single
         page PDFs.
@@ -144,12 +177,13 @@ class PDFHandler(object):
         Parameters
         ----------
         flavor : str (default: 'lattice')
-            The parsing method to use ('lattice' or 'stream').
+            The parsing method to use ('lattice', 'stream', 'network',
+            or 'hybrid').
             Lattice is used by default.
         suppress_stdout : str (default: False)
             Suppress logs and warnings.
         layout_kwargs : dict, optional (default: {})
-            A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
+            A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.  # noqa
         kwargs : dict
             See camelot.read_pdf kwargs.
@@ -159,17 +193,24 @@ class PDFHandler(object):
             List of tables found in PDF.

         """
+        layout_kwargs = layout_kwargs or {}
         tables = []
-        with TemporaryDirectory() as tempdir:
-            for p in self.pages:
-                self._save_page(self.filepath, p, tempdir)
-            pages = [
-                os.path.join(tempdir, f"page-{p}.pdf") for p in self.pages
-            ]
-            parser = Lattice(**kwargs) if flavor == "lattice" else Stream(**kwargs)
-            for p in pages:
-                t = parser.extract_tables(
-                    p, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs
-                )
-                tables.extend(t)
+
+        parser_obj = PARSERS[flavor]
+        parser = parser_obj(debug=self.debug, **kwargs)
+
+        # Read the layouts/dimensions of each of the pages we need to
+        # parse. This might require creating a temporary .pdf.
+        for page_idx in self.pages:
+            layout, dimensions, source_file = self._read_pdf_page(
+                page_idx,
+                layout_kwargs=layout_kwargs
+            )
+            parser.prepare_page_parse(source_file, layout, dimensions,
+                                      page_idx, layout_kwargs)
+            if not suppress_stdout:
+                rootname = os.path.basename(parser.rootname)
+                logger.info(f"Processing {rootname}")
+            t = parser.extract_tables()
+            tables.extend(t)
         return TableList(sorted(tables))
camelot/image.py

@@ -6,7 +6,6 @@ import numpy as np

 def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
     """Thresholds an image using OpenCV's adaptiveThreshold.
-
     Parameters
     ----------
     imagename : string
@@ -16,21 +15,17 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
     blocksize : int, optional (default: 15)
         Size of a pixel neighborhood that is used to calculate a
         threshold value for the pixel: 3, 5, 7, and so on.
-
        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
     c : int, optional (default: -2)
         Constant subtracted from the mean or weighted mean.
         Normally, it is positive but may be zero or negative as well.
-
        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
-
     Returns
     -------
     img : object
         numpy.ndarray representing the original image.
     threshold : object
         numpy.ndarray representing the thresholded image.
-
     """
     img = cv2.imread(imagename)
     gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
@@ -56,7 +51,6 @@ def find_lines(
 ):
     """Finds horizontal and vertical lines by applying morphological
     transformations on an image.
-
     Parameters
     ----------
     threshold : object
@@ -70,14 +64,11 @@ def find_lines(
     line_scale : int, optional (default: 15)
         Factor by which the page dimensions will be divided to get
         smallest length of lines that should be detected.
-
        The larger this value, smaller the detected lines. Making it
        too large will lead to text being detected as lines.
     iterations : int, optional (default: 0)
         Number of times for erosion/dilation is applied.
-
        For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
-
     Returns
     -------
     dmask : object
@@ -87,7 +78,6 @@ def find_lines(
        List of tuples representing vertical/horizontal lines with
        coordinates relative to a left-top origin in
        image coordinate space.
-
     """
     lines = []
@@ -135,21 +125,18 @@ def find_lines(

 def find_contours(vertical, horizontal):
     """Finds table boundaries using OpenCV's findContours.
-
     Parameters
     ----------
     vertical : object
         numpy.ndarray representing pixels where vertical lines lie.
     horizontal : object
         numpy.ndarray representing pixels where horizontal lines lie.
-
     Returns
     -------
     cont : list
         List of tuples representing table boundaries. Each tuple is of
         the form (x, y, w, h) where (x, y) -> left-top, w -> width and
         h -> height in image coordinate space.
-
     """
     mask = vertical + horizontal
@@ -175,7 +162,6 @@ def find_contours(vertical, horizontal):

 def find_joints(contours, vertical, horizontal):
     """Finds joints/intersections present inside each table boundary.
-
     Parameters
     ----------
     contours : list
@@ -186,7 +172,6 @@ def find_joints(contours, vertical, horizontal):
        numpy.ndarray representing pixels where vertical lines lie.
     horizontal : object
        numpy.ndarray representing pixels where horizontal lines lie.
-
     Returns
     -------
     tables : dict
@@ -194,7 +179,6 @@ def find_joints(contours, vertical, horizontal):
        in that boundary as their value.
        Keys are of the form (x1, y1, x2, y2) where (x1, y1) -> lb
        and (x2, y2) -> rt in image coordinate space.
-
     """
     joints = np.multiply(vertical, horizontal)
     tables = {}
camelot/io.py

@@ -7,14 +7,14 @@ from .utils import validate_input, remove_extra


 def read_pdf(
     filepath,
     pages="1",
     password=None,
     flavor="lattice",
     suppress_stdout=False,
-    layout_kwargs={},
-    **kwargs
-):
+    layout_kwargs=None,
+    debug=False,
+    **kwargs):
     """Read PDF and return extracted tables.

     Note: kwargs annotated with ^ can only be used with flavor='stream'
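
Swapping layout_kwargs={} for None avoids Python's shared-mutable-default pitfall: the {} literal is evaluated once at definition time and reused across calls. A self-contained illustration:

    def counter(opts={}):              # one dict object shared by every call
        opts["n"] = opts.get("n", 0) + 1
        return opts["n"]

    assert counter() == 1
    assert counter() == 2              # state leaks between calls

    def counter_fixed(opts=None):
        opts = opts or {}              # fresh dict per call, as read_pdf now does
        opts["n"] = opts.get("n", 0) + 1
        return opts["n"]

    assert counter_fixed() == 1
    assert counter_fixed() == 1
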
@@ -80,16 +80,16 @@ def read_pdf(
        Size of a pixel neighborhood that is used to calculate a
        threshold value for the pixel: 3, 5, 7, and so on.

-        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
+        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.  # noqa
     threshold_constant* : int, optional (default: -2)
        Constant subtracted from the mean or weighted mean.
        Normally, it is positive but may be zero or negative as well.

-        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
+        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.  # noqa
     iterations* : int, optional (default: 0)
        Number of times for erosion/dilation is applied.

-        For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
+        For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.  # noqa
     resolution* : int, optional (default: 300)
        Resolution used for PDF to PNG conversion.
@@ -98,9 +98,11 @@ def read_pdf(
     tables : camelot.core.TableList

     """
-    if flavor not in ["lattice", "stream"]:
+    layout_kwargs = layout_kwargs or {}
+    if flavor not in ["lattice", "stream", "network", "hybrid"]:
         raise NotImplementedError(
-            "Unknown flavor specified." " Use either 'lattice' or 'stream'"
+            "Unknown flavor specified."
+            " Use either 'lattice', 'stream', or 'network'"
         )

     with warnings.catch_warnings():
@@ -108,7 +110,7 @@ def read_pdf(
         warnings.simplefilter("ignore")

         validate_input(kwargs, flavor=flavor)
-        p = PDFHandler(filepath, pages=pages, password=password)
+        p = PDFHandler(filepath, pages=pages, password=password, debug=debug)
         kwargs = remove_extra(kwargs, flavor=flavor)
         tables = p.parse(
             flavor=flavor,
camelot/parsers/__init__.py

@@ -2,3 +2,5 @@

 from .stream import Stream
 from .lattice import Lattice
+from .network import Network
+from .hybrid import Hybrid
camelot/parsers/base.py

@@ -1,20 +1,484 @@
 # -*- coding: utf-8 -*-

 import os
+import warnings

-from ..utils import get_page_layout, get_text_objects
+import numpy as np
+import pandas as pd
+
+from ..utils import (
+    bbox_from_str,
+    compute_accuracy,
+    compute_whitespace,
+    get_text_objects,
+    get_table_index,
+    text_in_bbox,
+)
+from ..core import Table


-class BaseParser(object):
+class BaseParser():
     """Defines a base parser.
     """

+    def __init__(
+            self,
+            parser_id,
+            table_regions=None,
+            table_areas=None,
+            copy_text=None,
+            split_text=False,
+            strip_text="",
+            shift_text=None,
+            flag_size=False,
+            debug=False):
+        self.id = parser_id
+        self.table_regions = table_regions
+        self.table_areas = table_areas
+        self.table_bbox_parses = {}
+        self.columns = None
+        self.copy_text = copy_text
+        self.split_text = split_text
+        self.strip_text = strip_text
+        self.shift_text = shift_text
+
+        self.flag_size = flag_size
+
+        self.rootname = None
+        self.t_bbox = None
+
+        # For plotting details of parsing algorithms
+        self.resolution = 300  # default plotting resolution of the PDF.
+        self.parse_details = {}
+        if not debug:
+            self.parse_details = None
+
+    def table_bboxes(self):
+        return sorted(
+            self.table_bbox_parses.keys(),
+            key=lambda x: x[1],
+            reverse=True
+        )
+
-    def _generate_layout(self, filename, layout_kwargs):
+    def prepare_page_parse(self, filename, layout, dimensions,
+                           page_idx, layout_kwargs):
         self.filename = filename
         self.layout_kwargs = layout_kwargs
-        self.layout, self.dimensions = get_page_layout(filename, **layout_kwargs)
+        self.layout = layout
+        self.dimensions = dimensions
+        self.page = page_idx
         self.images = get_text_objects(self.layout, ltype="image")
-        self.horizontal_text = get_text_objects(self.layout, ltype="horizontal_text")
-        self.vertical_text = get_text_objects(self.layout, ltype="vertical_text")
+        self.horizontal_text = get_text_objects(
+            self.layout,
+            ltype="horizontal_text"
+        )
+        self.vertical_text = get_text_objects(
+            self.layout,
+            ltype="vertical_text"
+        )
         self.pdf_width, self.pdf_height = self.dimensions
         self.rootname, __ = os.path.splitext(self.filename)
+
+        if self.parse_details is not None:
+            self.parse_details["table_regions"] = self.table_regions
+            self.parse_details["table_areas"] = self.table_areas
+
+    def _apply_regions_filter(self, textlines):
+        """If regions have been specified, filter textlines to these regions.
+
+        Parameters
+        ----------
+        textlines : list
+            list of textlines to be filtered
+
+        Returns
+        -------
+        filtered_textlines : list of textlines within the regions specified
+
+        """
+        filtered_textlines = []
+        if self.table_regions is None:
+            filtered_textlines.extend(textlines)
+        else:
+            for region_str in self.table_regions:
+                region_text = text_in_bbox(
+                    bbox_from_str(region_str),
+                    textlines
+                )
+                filtered_textlines.extend(region_text)
+        return filtered_textlines
+
+    def _document_has_no_text(self):
+        """Detects image only documents and warns.
+
+        Returns
+        -------
+        has_no_text : bool
+            Whether the document doesn't have any text at all.
+        """
+        if not self.horizontal_text:
+            rootname = os.path.basename(self.rootname)
+            if self.images:
+                warnings.warn(
+                    "{rootname} is image-based, "
+                    "camelot only works on text-based pages."
+                    .format(rootname=rootname)
+                )
+            else:
+                warnings.warn(
+                    "No tables found on {rootname}".format(rootname=rootname)
+                )
+            return True
+        return False
+
+    def _initialize_new_table(self, table_idx, bbox, cols, rows):
+        """Initialize new table object, ready to be populated
+
+        Parameters
+        ----------
+        table_idx : int
+            Index of this table within the pdf page analyzed
+        bbox : set
+            bounding box of this table within the pdf page analyzed
+        cols : list
+            list of coordinate boundaries tuples (left, right)
+        rows : list
+            list of coordinate boundaries tuples (bottom, top)
+
+        Returns
+        -------
+        table : camelot.core.Table
+
+        """
+        table = Table(cols, rows)
+        table.page = self.page
+        table.order = table_idx + 1
+        table._bbox = bbox
+        return table
+
+    @staticmethod
+    def _reduce_index(t, idx, shift_text):
+        """Reduces index of a text object if it lies within a spanning
+        cell. Only useful for some parsers (e.g. Lattice), base method is a
+        noop.
+        """
+        return idx
+
+    def compute_parse_errors(self, table):
+        pos_errors = []
+        # TODO: have a single list in place of two directional ones?
+        # sorted on x-coordinate based on reading order i.e. LTR or RTL
+        for direction in ["vertical", "horizontal"]:
+            for t in self.t_bbox[direction]:
+                indices, error = get_table_index(
+                    table,
+                    t,
+                    direction,
+                    split_text=self.split_text,
+                    flag_size=self.flag_size,
+                    strip_text=self.strip_text,
+                )
+                if indices[:2] != (-1, -1):
+                    pos_errors.append(error)
+                    indices = type(self)._reduce_index(
+                        table,
+                        indices,
+                        shift_text=self.shift_text
+                    )
+                    for r_idx, c_idx, text in indices:
+                        table.cells[r_idx][c_idx].text = text
+        return pos_errors
+
+    def _generate_columns_and_rows(self, bbox, user_cols):
+        # Pure virtual, must be defined by the derived parser
+        raise NotImplementedError()
+
+    def _generate_table(self, table_idx, bbox, cols, rows, **kwargs):
+        # Pure virtual, must be defined by the derived parser
+        raise NotImplementedError()
+
+    def _generate_table_bbox(self):
+        # Pure virtual, must be defined by the derived parser
+        raise NotImplementedError()
+
+    def extract_tables(self):
+        if self._document_has_no_text():
+            return []
+
+        # Identify plausible areas within the doc where tables lie,
+        # populate table_bbox keys with these areas.
+        self._generate_table_bbox()
+
+        _tables = []
+        # sort tables based on y-coord
+        for table_idx, bbox in enumerate(self.table_bboxes()):
|
||||||
|
if self.columns is not None and self.columns[table_idx] != "":
|
||||||
|
# user has to input boundary columns too
|
||||||
|
# take (0, pdf_width) by default
|
||||||
|
# similar to else condition
|
||||||
|
# len can't be 1
|
||||||
|
user_cols = self.columns[table_idx].split(",")
|
||||||
|
user_cols = [float(c) for c in user_cols]
|
||||||
|
else:
|
||||||
|
user_cols = None
|
||||||
|
|
||||||
|
cols, rows, v_s, h_s = self._generate_columns_and_rows(
|
||||||
|
bbox,
|
||||||
|
user_cols
|
||||||
|
)
|
||||||
|
table = self._generate_table(
|
||||||
|
table_idx, bbox, cols, rows, v_s=v_s, h_s=h_s)
|
||||||
|
_tables.append(table)
|
||||||
|
|
||||||
|
return _tables
|
||||||
|
|
||||||
|
def record_parse_metadata(self, table):
|
||||||
|
"""Record data about the origin of the table
|
||||||
|
"""
|
||||||
|
table.flavor = self.id
|
||||||
|
table.filename = self.filename
|
||||||
|
table.parse = self.table_bbox_parses[table._bbox]
|
||||||
|
table.parse_details = self.parse_details
|
||||||
|
pos_errors = self.compute_parse_errors(table)
|
||||||
|
table.accuracy = compute_accuracy([[100, pos_errors]])
|
||||||
|
|
||||||
|
if self.copy_text is not None:
|
||||||
|
table.copy_spanning_text(self.copy_text)
|
||||||
|
|
||||||
|
data = table.data
|
||||||
|
table.df = pd.DataFrame(data)
|
||||||
|
table.shape = table.df.shape
|
||||||
|
|
||||||
|
table.whitespace = compute_whitespace(data)
|
||||||
|
table.pdf_size = (self.pdf_width, self.pdf_height)
|
||||||
|
|
||||||
|
_text = []
|
||||||
|
_text.extend(
|
||||||
|
[(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
|
||||||
|
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
|
||||||
|
table._text = _text
|
||||||
|
table.textlines = self.horizontal_text + self.vertical_text
|
||||||
|
|
||||||
|
|
||||||
|
class TextBaseParser(BaseParser):
|
||||||
|
"""Base class for all text parsers.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
parser_id,
|
||||||
|
table_regions=None,
|
||||||
|
table_areas=None,
|
||||||
|
columns=None,
|
||||||
|
flag_size=False,
|
||||||
|
split_text=False,
|
||||||
|
strip_text="",
|
||||||
|
edge_tol=50,
|
||||||
|
row_tol=2,
|
||||||
|
column_tol=0,
|
||||||
|
debug=False,
|
||||||
|
**kwargs):
|
||||||
|
super().__init__(
|
||||||
|
parser_id,
|
||||||
|
table_regions=table_regions,
|
||||||
|
table_areas=table_areas,
|
||||||
|
split_text=split_text,
|
||||||
|
strip_text=strip_text,
|
||||||
|
flag_size=flag_size,
|
||||||
|
debug=debug,
|
||||||
|
)
|
||||||
|
self.columns = columns
|
||||||
|
self._validate_columns()
|
||||||
|
self.edge_tol = edge_tol
|
||||||
|
self.row_tol = row_tol
|
||||||
|
self.column_tol = column_tol
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _group_rows(text, row_tol=2):
|
||||||
|
"""Groups PDFMiner text objects into rows vertically
|
||||||
|
within a tolerance.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
text : list
|
||||||
|
List of PDFMiner text objects.
|
||||||
|
row_tol : int, optional (default: 2)
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
rows : list
|
||||||
|
Two-dimensional list of text objects grouped into rows.
|
||||||
|
|
||||||
|
"""
|
||||||
|
row_y = None
|
||||||
|
rows = []
|
||||||
|
temp = []
|
||||||
|
non_empty_text = [t for t in text if t.get_text().strip()]
|
||||||
|
for t in non_empty_text:
|
||||||
|
# is checking for upright necessary?
|
||||||
|
# if t.get_text().strip() and all([obj.upright \
|
||||||
|
# for obj in t._objs
|
||||||
|
# if type(obj) is LTChar]):
|
||||||
|
if row_y is None:
|
||||||
|
row_y = t.y0
|
||||||
|
elif not np.isclose(row_y, t.y0, atol=row_tol):
|
||||||
|
rows.append(sorted(temp, key=lambda t: t.x0))
|
||||||
|
temp = []
|
||||||
|
# We update the row's bottom as we go, to be forgiving if there
|
||||||
|
# is a gradual change across multiple columns.
|
||||||
|
row_y = t.y0
|
||||||
|
temp.append(t)
|
||||||
|
rows.append(sorted(temp, key=lambda t: t.x0))
|
||||||
|
return rows
|
||||||
|
|
||||||
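To see what _group_rows produces, here is a minimal sketch. Illustration only: the tl helper is a hypothetical stand-in for a PDFMiner text object, and the camelot.parsers.base module path is assumed from the relative imports.

# Illustration only, not part of the diff.
from types import SimpleNamespace

from camelot.parsers.base import TextBaseParser  # assumed module path


def tl(text, x0, y0):
    # Hypothetical stand-in exposing only what _group_rows reads.
    obj = SimpleNamespace(x0=x0, y0=y0)
    obj.get_text = lambda: text
    return obj


rows = TextBaseParser._group_rows(
    [tl("a", 0, 100), tl("b", 50, 100.5), tl("c", 0, 80)], row_tol=2)
# "a" and "b" share a row (their y0 differ by less than the tolerance);
# "c" starts a new one: [[<a>, <b>], [<c>]]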
    @staticmethod
    def _merge_columns(l, column_tol=0):
        """Merges column boundaries horizontally if they overlap
        or lie within a tolerance.

        Parameters
        ----------
        l : list
            List of column x-coordinate tuples.
        column_tol : int, optional (default: 0)

        Returns
        -------
        merged : list
            List of merged column x-coordinate tuples.

        """
        merged = []
        for higher in l:
            if not merged:
                merged.append(higher)
            else:
                lower = merged[-1]
                if column_tol >= 0:
                    if higher[0] <= lower[1] or np.isclose(
                        higher[0], lower[1], atol=column_tol
                    ):
                        upper_bound = max(lower[1], higher[1])
                        lower_bound = min(lower[0], higher[0])
                        merged[-1] = (lower_bound, upper_bound)
                    else:
                        merged.append(higher)
                elif column_tol < 0:
                    if higher[0] <= lower[1]:
                        if np.isclose(higher[0], lower[1],
                                      atol=abs(column_tol)):
                            merged.append(higher)
                        else:
                            upper_bound = max(lower[1], higher[1])
                            lower_bound = min(lower[0], higher[0])
                            merged[-1] = (lower_bound, upper_bound)
                    else:
                        merged.append(higher)
        return merged
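To make the merge rule concrete, a worked illustration with invented interval values:

# Illustration only, not part of the diff: with the default column_tol=0,
# intervals that touch or overlap are fused, disjoint ones are kept.
TextBaseParser._merge_columns([(0, 10), (9, 20), (30, 40)])
# -> [(0, 20), (30, 40)]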
    @staticmethod
    def _join_rows(rows_grouped, text_y_max, text_y_min):
        """Makes row coordinates continuous. For two rows to "touch",
        the existing gap between them is split in half.

        Parameters
        ----------
        rows_grouped : list
            Two-dimensional list of text objects grouped into rows.
        text_y_max : int
        text_y_min : int

        Returns
        -------
        rows : list
            List of continuous row y-coordinate tuples.

        """
        row_boundaries = [
            [
                max(t.y1 for t in r),
                min(t.y0 for t in r)
            ]
            for r in rows_grouped
        ]
        for i in range(0, len(row_boundaries)-1):
            top_row = row_boundaries[i]
            bottom_row = row_boundaries[i+1]
            top_row[1] = bottom_row[0] = (top_row[1] + bottom_row[0]) / 2
        row_boundaries[0][0] = text_y_max
        row_boundaries[-1][1] = text_y_min
        return row_boundaries

    @staticmethod
    def _add_columns(cols, text, row_tol):
        """Adds columns to the existing list by taking into account
        the text that lies outside the current column x-coordinates.

        Parameters
        ----------
        cols : list
            List of column x-coordinate tuples.
        text : list
            List of PDFMiner text objects.
        row_tol : int

        Returns
        -------
        cols : list
            Updated list of column x-coordinate tuples.

        """
        if text:
            text = TextBaseParser._group_rows(text, row_tol=row_tol)
            elements = [len(r) for r in text]
            new_cols = [
                (t.x0, t.x1)
                for r in text if len(r) == max(elements)
                for t in r
            ]
            cols.extend(TextBaseParser._merge_columns(sorted(new_cols)))
        return cols

    @staticmethod
    def _join_columns(cols, text_x_min, text_x_max):
        """Makes column coordinates continuous.

        Parameters
        ----------
        cols : list
            List of column x-coordinate tuples.
        text_x_min : int
        text_x_max : int

        Returns
        -------
        cols : list
            Updated list of column x-coordinate tuples.

        """
        cols = sorted(cols)
        cols = [(cols[i][0] + cols[i - 1][1]) / 2 for i in range(1, len(cols))]
        cols.insert(0, text_x_min)
        cols.append(text_x_max)
        cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
        return cols
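A worked illustration of _join_columns with invented coordinates: gaps between column intervals are closed at their midpoints, then the outer edges are pinned to the text extent.

# Illustration only, not part of the diff.
TextBaseParser._join_columns([(0, 10), (20, 30)], text_x_min=0, text_x_max=40)
# The midpoint of the 10-20 gap is 15 -> [(0, 15), (15, 40)]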
    def _validate_columns(self):
        if self.table_areas is not None and self.columns is not None:
            if len(self.table_areas) != len(self.columns):
                raise ValueError("Length of table_areas and columns"
                                 " should be equal")

    def _generate_table(self, table_idx, bbox, cols, rows, **kwargs):
        table = self._initialize_new_table(table_idx, bbox, cols, rows)
        table = table.set_all_edges()
        self.record_parse_metadata(table)

        return table

    def record_parse_metadata(self, table):
        """Record data about the origin of the table."""
        super().record_parse_metadata(table)
        # for plotting
        table._segments = None

@ -0,0 +1,235 @@
# -*- coding: utf-8 -*-

from ..utils import (
    bboxes_overlap,
    boundaries_to_split_lines,
)

import numpy as np
from .base import BaseParser
from .network import Network
from .lattice import Lattice


class Hybrid(BaseParser):
    """Defines a hybrid parser, leveraging both network and lattice parsers.

    Parameters
    ----------
    table_regions : list, optional (default: None)
        List of page regions that may contain tables of the form x1,y1,x2,y2
        where (x1, y1) -> left-top and (x2, y2) -> right-bottom
        in PDF coordinate space.
    table_areas : list, optional (default: None)
        List of table area strings of the form x1,y1,x2,y2
        where (x1, y1) -> left-top and (x2, y2) -> right-bottom
        in PDF coordinate space.
    columns : list, optional (default: None)
        List of column x-coordinate strings where the coordinates
        are comma-separated.
    split_text : bool, optional (default: False)
        Split text that spans across multiple cells.
    flag_size : bool, optional (default: False)
        Flag text based on font size. Useful to detect
        super/subscripts. Adds <s></s> around flagged text.
    strip_text : str, optional (default: '')
        Characters that should be stripped from a string before
        assigning it to a cell.
    edge_tol : int, optional (default: 50)
        Tolerance parameter for extending textedges vertically.
    row_tol : int, optional (default: 2)
        Tolerance parameter used to combine text vertically,
        to generate rows.
    column_tol : int, optional (default: 0)
        Tolerance parameter used to combine text horizontally,
        to generate columns.

    """

    def __init__(
            self,
            table_regions=None,
            table_areas=None,
            columns=None,
            flag_size=False,
            split_text=False,
            strip_text="",
            edge_tol=None,
            row_tol=2,
            column_tol=0,
            debug=False,
            **kwargs):
        super().__init__(
            "hybrid",
            table_regions=table_regions,
            table_areas=table_areas,
            flag_size=flag_size,
            split_text=split_text,
            strip_text=strip_text,
            debug=debug,
        )
        self.columns = columns  # Column settings impact the hybrid table
        self.network_parser = Network(
            table_regions=table_regions,
            table_areas=table_areas,
            columns=columns,
            flag_size=flag_size,
            split_text=split_text,
            strip_text=strip_text,
            edge_tol=edge_tol,
            row_tol=row_tol,
            column_tol=column_tol,
            debug=debug,
        )
        self.lattice_parser = Lattice(
            table_regions=table_regions,
            table_areas=table_areas,
            flag_size=flag_size,
            split_text=split_text,
            strip_text=strip_text,
            edge_tol=edge_tol,
            row_tol=row_tol,
            column_tol=column_tol,
            debug=debug,
        )
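For orientation, a sketch of how the new parser would be driven end to end. That read_pdf exposes it under flavor="hybrid" is an assumption here, suggested by the "hybrid" parser id above but not shown in this diff.

# Sketch only; the flavor="hybrid" wiring is assumed, not shown in this diff.
import camelot

tables = camelot.read_pdf("foo.pdf", pages="1", flavor="hybrid")
print(tables[0].df)  # extraction merged from lattice and network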
    def prepare_page_parse(self, filename, layout, dimensions,
                           page_idx, layout_kwargs):
        super().prepare_page_parse(filename, layout, dimensions,
                                   page_idx, layout_kwargs)
        self.network_parser.prepare_page_parse(
            filename, layout, dimensions, page_idx, layout_kwargs)
        self.lattice_parser.prepare_page_parse(
            filename, layout, dimensions, page_idx, layout_kwargs)

    def _generate_columns_and_rows(self, bbox, table_idx):
        parser = self.table_bbox_parses[bbox]
        return parser._generate_columns_and_rows(bbox, table_idx)

    def _generate_table(self, table_idx, bbox, cols, rows, **kwargs):
        parser = self.table_bbox_parses[bbox]
        table = parser._generate_table(table_idx, bbox, cols, rows, **kwargs)
        # Because hybrid can inject extraneous splits from both lattice and
        # network, remove rows / cols that are completely empty.
        table.df = table.df.replace('', np.nan)
        table.df = table.df.dropna(axis=0, how="all")
        table.df = table.df.dropna(axis=1, how="all")
        table.df = table.df.replace(np.nan, '')
        table.shape = table.df.shape
        return table
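The empty-row/column cleanup above is plain pandas; a self-contained illustration with invented data:

# Illustration only, not part of the diff.
import numpy as np
import pandas as pd

df = pd.DataFrame([["a", "", "b"],
                   ["",  "", ""],
                   ["c", "", "d"]])
df = df.replace('', np.nan)
df = df.dropna(axis=0, how="all").dropna(axis=1, how="all")
df = df.replace(np.nan, '')
# The all-empty middle row and middle column are gone:
#    0  2
# 0  a  b
# 2  c  d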
    @staticmethod
    def _augment_boundaries_with_splits(boundaries, splits, tolerance=0):
        """Augment existing boundaries using provided hard splits.

        Boundaries: |---|   |-|     |---------|
        Splits:     |       |       |         |
        Augmented:  |-------|-----|-------|--|
        """
        idx_boundaries = len(boundaries) - 1
        idx_splits = len(splits) - 1
        previous_boundary = None
        while True:
            if idx_splits < 0:
                # No more splits to incorporate, we're done
                break
            split = splits[idx_splits]

            if idx_boundaries < 0:
                # Need to insert remaining splits
                new_boundary = [split, boundaries[0][0]]
                boundaries.insert(0, new_boundary)
                idx_splits = idx_splits - 1
            else:
                boundary = boundaries[idx_boundaries]
                if boundary[1] < split + tolerance:
                    # The lattice column is further to the right of our
                    # col boundary. We move our left boundary to match.
                    boundary[1] = split
                    # And if there was another segment after, we make its
                    # right boundary match as well so that there's no gap
                    if previous_boundary is not None:
                        previous_boundary[0] = split
                    idx_splits = idx_splits - 1
                elif boundary[0] > split - tolerance:
                    # Our boundary is fully after the split, move on
                    idx_boundaries = idx_boundaries - 1
                    previous_boundary = boundary
                    if idx_boundaries < 0:
                        # If this is the last boundary to the left, set its
                        # edge at the split
                        boundary[0] = split
                        idx_splits = idx_splits - 1
                else:
                    # The split is inside our boundary: split it
                    new_boundary = [split, boundary[1]]
                    boundaries.insert(idx_boundaries + 1, new_boundary)
                    boundary[1] = split
                    previous_boundary = new_boundary
                    idx_splits = idx_splits - 1
        return boundaries
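A traced example of the augmentation, with invented values: one lattice split at x=4 falls in the gap between two network column boundaries.

# Illustration only, not part of the diff.
boundaries = [[0, 2], [5, 6]]
Hybrid._augment_boundaries_with_splits(boundaries, splits=[4], tolerance=0)
# The split pulls the facing edges together so the columns meet at x=4:
# [[0, 4], [4, 6]]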
    def _merge_bbox_analysis(self, lattice_bbox, network_bbox):
        """Identify splits that were only detected by lattice or by network.
        """
        lattice_parse = self.lattice_parser.table_bbox_parses[lattice_bbox]
        lattice_cols = lattice_parse["col_anchors"]

        network_bbox_data = self.network_parser.table_bbox_parses[network_bbox]
        network_cols_boundaries = network_bbox_data["cols_boundaries"]

        # Favor network, but complete or adjust its columns based on the
        # splits identified by lattice.
        if network_cols_boundaries is None:
            self.table_bbox_parses[lattice_bbox] = self.lattice_parser
        else:
            network_cols_boundaries = self._augment_boundaries_with_splits(
                network_cols_boundaries,
                lattice_cols,
                self.lattice_parser.joint_tol)
            augmented_bbox = (
                network_cols_boundaries[0][0],
                min(lattice_bbox[1], network_bbox[1]),
                network_cols_boundaries[-1][1],
                max(lattice_bbox[3], network_bbox[3]),
            )
            network_bbox_data["cols_anchors"] = \
                boundaries_to_split_lines(network_cols_boundaries)

            del self.network_parser.table_bbox_parses[network_bbox]
            self.network_parser.table_bbox_parses[augmented_bbox] = \
                network_bbox_data
            self.table_bbox_parses[augmented_bbox] = self.network_parser

    def _generate_table_bbox(self):
        # Collect bboxes from both parsers
        self.lattice_parser._generate_table_bbox()
        _lattice_bboxes = sorted(
            self.lattice_parser.table_bbox_parses,
            key=lambda bbox: (bbox[0], -bbox[1]))
        self.network_parser._generate_table_bbox()
        _network_bboxes = sorted(
            self.network_parser.table_bbox_parses,
            key=lambda bbox: (bbox[0], -bbox[1]))

        # Merge the data from both processes
        for lattice_bbox in _lattice_bboxes:
            merged = False

            for idx in range(len(_network_bboxes)-1, -1, -1):
                network_bbox = _network_bboxes[idx]
                if not bboxes_overlap(lattice_bbox, network_bbox):
                    continue
                self._merge_bbox_analysis(lattice_bbox, network_bbox)
                # network_bbox_data["cols_boundaries"]
                del _network_bboxes[idx]
                merged = True
            if not merged:
                self.table_bbox_parses[lattice_bbox] = self.lattice_parser

        # Add the bboxes from network that haven't been merged
        for network_bbox in _network_bboxes:
            self.table_bbox_parses[network_bbox] = self.network_parser

@ -1,27 +1,16 @@
  # -*- coding: utf-8 -*-

  import os
- import sys
- import copy
- import locale
- import logging
- import warnings
- import subprocess
-
- import numpy as np
- import pandas as pd

  from .base import BaseParser
- from ..core import Table
  from ..utils import (
+     build_file_path_in_temp_dir,
+     export_pdf_as_png,
      scale_image,
      scale_pdf,
      segments_in_bbox,
-     text_in_bbox,
+     text_in_bbox_per_axis,
      merge_close_lines,
-     get_table_index,
-     compute_accuracy,
-     compute_whitespace,
  )
  from ..image_processing import (
      adaptive_threshold,
@ -31,9 +20,6 @@ from ..image_processing import (
  )

- logger = logging.getLogger("camelot")

  class Lattice(BaseParser):
      """Lattice method of parsing looks for lines between text
      to parse the table.
@ -79,7 +65,7 @@ class Lattice(BaseParser):
          Size of a pixel neighborhood that is used to calculate a
          threshold value for the pixel: 3, 5, 7, and so on.

-         For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
+         For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.  # noqa
      threshold_constant : int, optional (default: -2)
          Constant subtracted from the mean or weighted mean.
          Normally, it is positive but may be zero or negative as well.
@ -95,39 +81,43 @@ class Lattice(BaseParser):
      """

      def __init__(
          self,
          table_regions=None,
          table_areas=None,
          process_background=False,
          line_scale=15,
          copy_text=None,
-         shift_text=["l", "t"],
+         shift_text=None,
          split_text=False,
          flag_size=False,
          strip_text="",
          line_tol=2,
          joint_tol=2,
          threshold_blocksize=15,
          threshold_constant=-2,
          iterations=0,
          resolution=300,
-         **kwargs
-     ):
-         self.table_regions = table_regions
-         self.table_areas = table_areas
+         **kwargs):
+         super().__init__(
+             "lattice",
+             table_regions=table_regions,
+             table_areas=table_areas,
+             split_text=split_text,
+             strip_text=strip_text,
+             copy_text=copy_text,
+             shift_text=shift_text or ["l", "t"],
+             flag_size=flag_size,
+         )
          self.process_background = process_background
          self.line_scale = line_scale
-         self.copy_text = copy_text
-         self.shift_text = shift_text
-         self.split_text = split_text
-         self.flag_size = flag_size
-         self.strip_text = strip_text
          self.line_tol = line_tol
          self.joint_tol = joint_tol
          self.threshold_blocksize = threshold_blocksize
          self.threshold_constant = threshold_constant
          self.iterations = iterations
          self.resolution = resolution
+         self.image_path = None
+         self.pdf_image = None

      @staticmethod
      def _reduce_index(t, idx, shift_text):
@ -174,51 +164,13 @@ class Lattice(BaseParser):
              indices.append((r_idx, c_idx, text))
          return indices

-     @staticmethod
-     def _copy_spanning_text(t, copy_text=None):
-         """Copies over text in empty spanning cells.
-
-         Parameters
-         ----------
-         t : camelot.core.Table
-         copy_text : list, optional (default: None)
-             {'h', 'v'}
-             Select one or more strings from above and pass them as a list
-             to specify the direction in which text should be copied over
-             when a cell spans multiple rows or columns.
-
-         Returns
-         -------
-         t : camelot.core.Table
-
-         """
-         for f in copy_text:
-             if f == "h":
-                 for i in range(len(t.cells)):
-                     for j in range(len(t.cells[i])):
-                         if t.cells[i][j].text.strip() == "":
-                             if t.cells[i][j].hspan and not t.cells[i][j].left:
-                                 t.cells[i][j].text = t.cells[i][j - 1].text
-             elif f == "v":
-                 for i in range(len(t.cells)):
-                     for j in range(len(t.cells[i])):
-                         if t.cells[i][j].text.strip() == "":
-                             if t.cells[i][j].vspan and not t.cells[i][j].top:
-                                 t.cells[i][j].text = t.cells[i - 1][j].text
-         return t
-
-     def _generate_image(self):
-         from ..ext.ghostscript import Ghostscript
-
-         self.imagename = "".join([self.rootname, ".png"])
-         gs_call = "-q -sDEVICE=png16m -o {} -r300 {}".format(
-             self.imagename, self.filename
-         )
-         gs_call = gs_call.encode().split()
-         null = open(os.devnull, "wb")
-         with Ghostscript(*gs_call, stdout=null) as gs:
-             pass
-         null.close()
+     def record_parse_metadata(self, table):
+         """Record data about the origin of the table."""
+         super().record_parse_metadata(table)
+         # for plotting
+         table._image = self.pdf_image  # Reuse the image used for calc
+         table._segments = (self.vertical_segments, self.horizontal_segments)

      def _generate_table_bbox(self):
          def scale_areas(areas):
@ -233,20 +185,26 @@ class Lattice(BaseParser):
              scaled_areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1)))
              return scaled_areas

-         self.image, self.threshold = adaptive_threshold(
-             self.imagename,
+         self.image_path = build_file_path_in_temp_dir(
+             os.path.basename(self.filename),
+             ".png"
+         )
+         export_pdf_as_png(self.filename, self.image_path, self.resolution)
+         self.pdf_image, self.threshold = adaptive_threshold(
+             self.image_path,
              process_background=self.process_background,
              blocksize=self.threshold_blocksize,
              c=self.threshold_constant,
          )

-         image_width = self.image.shape[1]
-         image_height = self.image.shape[0]
+         image_width = self.pdf_image.shape[1]
+         image_height = self.pdf_image.shape[0]
          image_width_scaler = image_width / float(self.pdf_width)
          image_height_scaler = image_height / float(self.pdf_height)
          pdf_width_scaler = self.pdf_width / float(image_width)
          pdf_height_scaler = self.pdf_height / float(image_height)
-         image_scalers = (image_width_scaler, image_height_scaler, self.pdf_height)
+         image_scalers = (image_width_scaler,
+                          image_height_scaler, self.pdf_height)
          pdf_scalers = (pdf_width_scaler, pdf_height_scaler, image_height)

          if self.table_areas is None:
@ -288,46 +246,88 @@ class Lattice(BaseParser):
              areas = scale_areas(self.table_areas)
              table_bbox = find_joints(areas, vertical_mask, horizontal_mask)

-         self.table_bbox_unscaled = copy.deepcopy(table_bbox)
-         self.table_bbox, self.vertical_segments, self.horizontal_segments = scale_image(
+         [
+             self.table_bbox_parses,
+             self.vertical_segments,
+             self.horizontal_segments
+         ] = scale_image(
              table_bbox, vertical_segments, horizontal_segments, pdf_scalers
          )

-     def _generate_columns_and_rows(self, table_idx, tk):
+         for bbox, parse in self.table_bbox_parses.items():
+             joints = parse["joints"]
+
+             # Merge x coordinates that are close together
+             line_tol = self.line_tol
+             # Sort the joints, make them a list of lists (instead of sets)
+             joints_normalized = list(
+                 map(
+                     lambda x: list(x),
+                     sorted(joints, key=lambda j: - j[0])
+                 )
+             )
+             for idx in range(1, len(joints_normalized)):
+                 x_left, x_right = \
+                     joints_normalized[idx-1][0], joints_normalized[idx][0]
+                 if x_left - line_tol <= x_right <= x_left + line_tol:
+                     joints_normalized[idx][0] = x_left
+
+             # Merge y coordinates that are close together
+             joints_normalized = sorted(joints_normalized, key=lambda j: -j[1])
+             for idx in range(1, len(joints_normalized)):
+                 y_bottom, y_top = \
+                     joints_normalized[idx-1][1], joints_normalized[idx][1]
+                 if y_bottom - line_tol <= y_top <= y_bottom + line_tol:
+                     joints_normalized[idx][1] = y_bottom
+
+             # FRHTODO: check this is useful, otherwise get rid of the code
+             # above
+             parse["joints_normalized"] = joints_normalized
+
+             cols = list(map(lambda coords: coords[0], joints))
+             cols.extend([bbox[0], bbox[2]])
+             rows = list(map(lambda coords: coords[1], joints))
+             rows.extend([bbox[1], bbox[3]])
+
+             # sort horizontal and vertical segments
+             cols = merge_close_lines(sorted(cols), line_tol=self.line_tol)
+             rows = merge_close_lines(
+                 sorted(rows, reverse=True),
+                 line_tol=self.line_tol
+             )
+             parse["col_anchors"] = cols
+             parse["row_anchors"] = rows
+
+     def _generate_columns_and_rows(self, bbox, user_cols):
          # select elements which lie within table_bbox
-         t_bbox = {}
          v_s, h_s = segments_in_bbox(
-             tk, self.vertical_segments, self.horizontal_segments
+             bbox, self.vertical_segments, self.horizontal_segments
          )
-         t_bbox["horizontal"] = text_in_bbox(tk, self.horizontal_text)
-         t_bbox["vertical"] = text_in_bbox(tk, self.vertical_text)
-
-         t_bbox["horizontal"].sort(key=lambda x: (-x.y0, x.x0))
-         t_bbox["vertical"].sort(key=lambda x: (x.x0, -x.y0))
-
-         self.t_bbox = t_bbox
-
-         cols, rows = zip(*self.table_bbox[tk])
-         cols, rows = list(cols), list(rows)
-         cols.extend([tk[0], tk[2]])
-         rows.extend([tk[1], tk[3]])
-         # sort horizontal and vertical segments
-         cols = merge_close_lines(sorted(cols), line_tol=self.line_tol)
-         rows = merge_close_lines(sorted(rows, reverse=True), line_tol=self.line_tol)
+         self.t_bbox = text_in_bbox_per_axis(
+             bbox,
+             self.horizontal_text,
+             self.vertical_text
+         )
+         parse = self.table_bbox_parses[bbox]

          # make grid using x and y coord of shortlisted rows and cols
-         cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
-         rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)]
+         cols = [
+             (parse["col_anchors"][i], parse["col_anchors"][i + 1])
+             for i in range(0, len(parse["col_anchors"]) - 1)
+         ]
+         rows = [
+             (parse["row_anchors"][i], parse["row_anchors"][i + 1])
+             for i in range(0, len(parse["row_anchors"]) - 1)
+         ]
          return cols, rows, v_s, h_s

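The anchors-to-cells step above is a simple pairwise zip over sorted anchor coordinates; a worked illustration with invented values:

# Illustration only, not part of the diff.
col_anchors = [0, 50, 120, 200]
cols = [(col_anchors[i], col_anchors[i + 1])
        for i in range(0, len(col_anchors) - 1)]
# Three column intervals: [(0, 50), (50, 120), (120, 200)]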
-     def _generate_table(self, table_idx, cols, rows, **kwargs):
+     def _generate_table(self, table_idx, bbox, cols, rows, **kwargs):
          v_s = kwargs.get("v_s")
          h_s = kwargs.get("h_s")
          if v_s is None or h_s is None:
              raise ValueError("No segments found on {}".format(self.rootname))

-         table = Table(cols, rows)
+         table = self._initialize_new_table(table_idx, bbox, cols, rows)
          # set table edges to True using ver+hor lines
          table = table.set_edges(v_s, h_s, joint_tol=self.joint_tol)
          # set table border edges to True
@ -335,81 +335,5 @@ class Lattice(BaseParser):
          # set spanning cells to True
          table = table.set_span()

-         pos_errors = []
-         # TODO: have a single list in place of two directional ones?
-         # sorted on x-coordinate based on reading order i.e. LTR or RTL
-         for direction in ["vertical", "horizontal"]:
-             for t in self.t_bbox[direction]:
-                 indices, error = get_table_index(
-                     table,
-                     t,
-                     direction,
-                     split_text=self.split_text,
-                     flag_size=self.flag_size,
-                     strip_text=self.strip_text,
-                 )
-                 if indices[:2] != (-1, -1):
-                     pos_errors.append(error)
-                     indices = Lattice._reduce_index(
-                         table, indices, shift_text=self.shift_text
-                     )
-                     for r_idx, c_idx, text in indices:
-                         table.cells[r_idx][c_idx].text = text
-         accuracy = compute_accuracy([[100, pos_errors]])
-
-         if self.copy_text is not None:
-             table = Lattice._copy_spanning_text(table, copy_text=self.copy_text)
-
-         data = table.data
-         table.df = pd.DataFrame(data)
-         table.shape = table.df.shape
-
-         whitespace = compute_whitespace(data)
-         table.flavor = "lattice"
-         table.accuracy = accuracy
-         table.whitespace = whitespace
-         table.order = table_idx + 1
-         table.page = int(os.path.basename(self.rootname).replace("page-", ""))
-
-         # for plotting
-         _text = []
-         _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
-         _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
-         table._text = _text
-         table._image = (self.image, self.table_bbox_unscaled)
-         table._segments = (self.vertical_segments, self.horizontal_segments)
-         table._textedges = None
+         self.record_parse_metadata(table)

          return table
-
-     def extract_tables(self, filename, suppress_stdout=False, layout_kwargs={}):
-         self._generate_layout(filename, layout_kwargs)
-         if not suppress_stdout:
-             logger.info("Processing {}".format(os.path.basename(self.rootname)))
-
-         if not self.horizontal_text:
-             if self.images:
-                 warnings.warn(
-                     "{} is image-based, camelot only works on"
-                     " text-based pages.".format(os.path.basename(self.rootname))
-                 )
-             else:
-                 warnings.warn(
-                     "No tables found on {}".format(os.path.basename(self.rootname))
-                 )
-             return []
-
-         self._generate_image()
-         self._generate_table_bbox()
-
-         _tables = []
-         # sort tables based on y-coord
-         for table_idx, tk in enumerate(
-             sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True)
-         ):
-             cols, rows, v_s, h_s = self._generate_columns_and_rows(table_idx, tk)
-             table = self._generate_table(table_idx, cols, rows, v_s=v_s, h_s=h_s)
-             table._bbox = tk
-             _tables.append(table)
-
-         return _tables
@ -0,0 +1,726 @@
# -*- coding: utf-8 -*-
"""Implementation of network table parser."""

from __future__ import division

import copy
import math
import numpy as np

from .base import TextBaseParser
from ..core import (
    TextAlignments,
    ALL_ALIGNMENTS,
    HORIZONTAL_ALIGNMENTS,
    VERTICAL_ALIGNMENTS
)
from ..utils import (
    bbox_from_str,
    text_in_bbox,
    textlines_overlapping_bbox,
    bbox_from_textlines,
    find_columns_boundaries,
    boundaries_to_split_lines,
    text_in_bbox_per_axis,
)

# Maximum number of columns over which a header can spread
MAX_COL_SPREAD_IN_HEADER = 3

# Minimum number of textlines in a table
MINIMUM_TEXTLINES_IN_TABLE = 6


def column_spread(left, right, col_anchors):
    """Get the number of columns crossed by a segment [left, right]."""
    index_left = 0
    while index_left < len(col_anchors) \
            and col_anchors[index_left] < left:
        index_left += 1
    index_right = index_left
    while index_right < len(col_anchors) \
            and col_anchors[index_right] < right:
        index_right += 1

    return index_right - index_left
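A quick check of the spread computation, with invented anchor values (the camelot.parsers.network module path is assumed from the relative imports):

# Illustration only, not part of the diff.
from camelot.parsers.network import column_spread  # assumed module path

column_spread(5, 25, [0, 10, 20, 30])
# Anchors 10 and 20 fall inside [5, 25], so the segment crosses 2 columns.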
def find_closest_tls(bbox, tls):
    """Search for the textlines that are closest to the bbox, but outside
    it, in all four directions.
    """
    left, right, top, bottom = None, None, None, None
    (bbox_left, bbox_bottom, bbox_right, bbox_top) = bbox
    for textline in tls:
        if textline.x1 < bbox_left:
            # Left: check it overlaps horizontally
            if textline.y0 > bbox_top or textline.y1 < bbox_bottom:
                continue
            if left is None or left.x1 < textline.x1:
                left = textline
        elif bbox_right < textline.x0:
            # Right: check it overlaps horizontally
            if textline.y0 > bbox_top or textline.y1 < bbox_bottom:
                continue
            if right is None or right.x0 > textline.x0:
                right = textline
        else:
            # Either bottom or top: must overlap vertically
            if textline.x0 > bbox_right or textline.x1 < bbox_left:
                continue
            if textline.y1 < bbox_bottom:
                # Bottom
                if bottom is None or bottom.y1 < textline.y1:
                    bottom = textline
            elif bbox_top < textline.y0:
                # Top
                if top is None or top.y0 > textline.y0:
                    top = textline
    return {
        "left": left,
        "right": right,
        "top": top,
        "bottom": bottom,
    }


def search_header_from_body_bbox(body_bbox, textlines, col_anchors, max_v_gap):
    """Expand a bbox vertically up by looking for plausible headers.

    The core algorithm is based on fairly strict alignment of text. It works
    for the table body, but might fail on table headers since they tend to be
    in a different font, alignment (e.g. vertical), etc.
    This method evaluates the area above the table body's bbox for
    characteristics of a table header: close to the top of the body, with
    cells that fit within the horizontal bounds identified.
    """
    new_bbox = body_bbox
    (left, bottom, right, top) = body_bbox
    zones = []

    keep_searching = True
    while keep_searching:
        keep_searching = False
        # a/ first look for the closest text element above the bbox.
        # It will be the anchor for a possible new row.
        closest_above = None
        all_above = []
        for textline in textlines:
            # higher than the table, >50% within its bounds
            textline_center = 0.5 * (textline.x0 + textline.x1)
            if textline.y0 > top and left < textline_center < right:
                all_above.append(textline)
                if closest_above is None or closest_above.y0 > textline.y0:
                    closest_above = textline

        if closest_above and closest_above.y0 < top + max_v_gap:
            # b/ We have a candidate cell that is within the correct
            # vertical band, and directly above the table. Starting from
            # this anchor, we list all the textlines within the same row.
            tls_in_new_row = []
            top = closest_above.y1
            pushed_up = True
            while pushed_up:
                pushed_up = False
                # Iterate and extract elements that fit in the row
                # from our list
                for i in range(len(all_above) - 1, -1, -1):
                    textline = all_above[i]
                    if textline.y0 < top:
                        # The bottom of this element is within our row
                        # so we add it.
                        tls_in_new_row.append(textline)
                        all_above.pop(i)
                        if textline.y1 > top:
                            # If the top of this element raises our row's
                            # band, we'll need to keep on searching for
                            # overlapping items
                            top = textline.y1
                            pushed_up = True

            # Get the x-ranges for all the textlines, and merge the
            # x-ranges that overlap
            zones = zones + list(
                map(
                    lambda textline: [textline.x0, textline.x1],
                    tls_in_new_row
                )
            )
            zones.sort(key=lambda z: z[0])  # Sort by left coordinate
            # Starting from the right, if two zones overlap horizontally,
            # merge them
            merged_something = True
            while merged_something:
                merged_something = False
                for i in range(len(zones) - 1, 0, -1):
                    zone_right = zones[i]
                    zone_left = zones[i-1]
                    if zone_left[1] >= zone_right[0]:
                        zone_left[1] = max(zone_right[1], zone_left[1])
                        zones.pop(i)
                        merged_something = True

            max_spread = max(
                list(
                    map(
                        lambda zone: column_spread(
                            zone[0], zone[1], col_anchors),
                        zones
                    )
                )
            )

            # Accept textlines that cross column boundaries, as long as they
            # cross less than MAX_COL_SPREAD_IN_HEADER, and half the number
            # of columns.
            # This is to avoid picking unrelated paragraphs.
            if max_spread <= min(
                    MAX_COL_SPREAD_IN_HEADER,
                    math.ceil(len(col_anchors) / 2)):
                # Combined, the elements we've identified don't cross more
                # than the authorized number of columns.
                # We're trying to avoid
                # 0: <BAD: Added header spans too broad>
                # 1: <A1> <B1> <C1> <D1> <E1>
                # 2: <A2> <B2> <C2> <D2> <E2>
                # if len(zones) > TEXTEDGE_REQUIRED_ELEMENTS:
                new_bbox = (left, bottom, right, top)

                # At this stage we've identified a plausible row (or the
                # beginning of one).
                keep_searching = True
    return new_bbox


class AlignmentCounter():
    """For a given textline, represent all other textlines aligned with it.

    A textline can be vertically aligned with others if their bboxes match
    on the left, right, or middle coordinate, and horizontally aligned if
    they match the top, bottom, or center coordinate.
    """

    def __init__(self):
        self.alignment_to_occurrences = {}
        for alignment in ALL_ALIGNMENTS:
            self.alignment_to_occurrences[alignment] = []

    def __getitem__(self, key):
        return self.alignment_to_occurrences[key]

    def __setitem__(self, key, value):
        self.alignment_to_occurrences[key] = value
        return value

    def max_alignments(self, alignment_ids=None):
        """Get the alignment dimension with the max number of textlines."""
        alignment_ids = alignment_ids or self.alignment_to_occurrences.keys()
        alignment_items = map(
            lambda alignment_id: (
                alignment_id,
                self.alignment_to_occurrences[alignment_id]
            ),
            alignment_ids
        )
        return max(alignment_items, key=lambda item: len(item[1]))

    def max_v(self):
        """Tuple (alignment_id, textlines) of the largest vertical column."""
        # Note that the horizontal alignments (left, center, right) are
        # aligned vertically in a column, so max_v is calculated by looking
        # at horizontal alignments.
        return self.max_alignments(HORIZONTAL_ALIGNMENTS)

    def max_h(self):
        """Tuple (alignment_id, textlines) of the largest horizontal row."""
        return self.max_alignments(VERTICAL_ALIGNMENTS)

    def max_v_count(self):
        """Returns the maximum number of alignments along
        one of the vertical axes (left/right/middle).
        """
        return len(self.max_v()[1])

    def max_h_count(self):
        """Returns the maximum number of alignments along
        one of the horizontal axes (bottom/top/center).
        """
        return len(self.max_h()[1])

    def alignment_score(self):
        """We define the alignment score of a textline as the product of
        the number of aligned elements minus one in each direction. The
        minus one is to avoid favoring singletons on a long line.
        """
        return (self.max_v_count()-1) * (self.max_h_count()-1)
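A sketch of the score arithmetic. The alignment ids "left" and "top" are assumed here to be among the keys defined by ALL_ALIGNMENTS in camelot.core, which this diff does not show:

# Illustration only, not part of the diff; alignment ids are assumed.
counter = AlignmentCounter()
counter["left"] = ["tl", "a", "b", "c"]  # 4 textlines share a left edge
counter["top"] = ["tl", "d", "e"]        # 3 textlines share a top edge
counter.alignment_score()                # (4 - 1) * (3 - 1) == 6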
class TextNetworks(TextAlignments):
    """Text elements connected by vertical AND horizontal alignments.

    The alignment dict has six keys based on the hor/vert alignments,
    and each key's value is a list of camelot.core.TextAlignment objects.
    """

    def __init__(self):
        super().__init__(ALL_ALIGNMENTS)
        # For each textline, dictionary "alignment type" to
        # "number of textlines aligned"
        self._textline_to_alignments = {}

    def _update_alignment(self, alignment, coord, textline):
        alignment.register_aligned_textline(textline, coord)

    def _register_all_text_lines(self, textlines):
        """Add all textlines to our network repository to
        identify alignments.
        """
        # Identify all the alignments
        for textline in textlines:
            if len(textline.get_text().strip()) > 0:
                self._register_textline(textline)

    def _compute_alignment_counts(self):
        """Build a dictionary textline -> alignment object."""
        for align_id, textedges in self._text_alignments.items():
            for textedge in textedges:
                for textline in textedge.textlines:
                    alignments = self._textline_to_alignments.get(
                        textline, None)
                    if alignments is None:
                        alignments = AlignmentCounter()
                        self._textline_to_alignments[textline] = alignments
                    alignments[align_id] = textedge.textlines

    def remove_unconnected_edges(self):
        """Weed out elements which are only connected to others vertically
        or horizontally. There need to be connections across both
        dimensions.
        """
        removed_singletons = True
        while removed_singletons:
            removed_singletons = False
            for text_alignments in self._text_alignments.values():
                # For each alignment edge, remove items if they are
                # singletons either horizontally or vertically
                for text_alignment in text_alignments:
                    for i in range(len(text_alignment.textlines) - 1, -1, -1):
                        textline = text_alignment.textlines[i]
                        alignments = self._textline_to_alignments[textline]
                        if alignments.max_h_count() <= 1 or \
                                alignments.max_v_count() <= 1:
                            del text_alignment.textlines[i]
                            removed_singletons = True
            self._textline_to_alignments = {}
            self._compute_alignment_counts()

    def most_connected_textline(self):
        """Retrieve the textline that is most connected across the vertical
        and horizontal axes.
        """
        # Find the textline with the highest alignment score, with a tie
        # break to prefer textlines further down in the table. Starting the
        # search from the table's bottom allows the algo to collect data on
        # more cells before going to the header, typically harder to parse.
        return max(
            self._textline_to_alignments.keys(),
            key=lambda textline:
            (
                self._textline_to_alignments[textline].alignment_score(),
                -textline.y0, -textline.x0
            ),
            default=None
        )

    def compute_plausible_gaps(self):
        """Evaluate plausible gaps between cells horizontally and vertically
        based on the textlines aligned with the most connected textline.

        Returns
        -------
        gaps_hv : tuple
            (horizontal_gap, vertical_gap) in PDF coordinate space.

        """
        # Determine the textline that has the most combined
        # alignments across the horizontal and vertical axes.
        # It will serve as a reference axis along which to collect the
        # average spacing between rows/cols.
        most_aligned_tl = self.most_connected_textline()
        if most_aligned_tl is None:
            return None

        # Retrieve the list of textlines it's aligned with, across both
        # axes
        best_alignment = self._textline_to_alignments[most_aligned_tl]
        __, ref_h_textlines = best_alignment.max_h()
        __, ref_v_textlines = best_alignment.max_v()
        if len(ref_v_textlines) <= 1 or len(ref_h_textlines) <= 1:
            return None

        h_textlines = sorted(
            ref_h_textlines,
            key=lambda textline: textline.x0,
            reverse=True
        )
        v_textlines = sorted(
            ref_v_textlines,
            key=lambda textline: textline.y0,
            reverse=True
        )

        h_gaps, v_gaps = [], []
        for i in range(1, len(v_textlines)):
            v_gaps.append(v_textlines[i-1].y0 - v_textlines[i].y0)
        for i in range(1, len(h_textlines)):
            h_gaps.append(h_textlines[i-1].x0 - h_textlines[i].x0)

        if not h_gaps or not v_gaps:
            return None
        percentile = 75
        gaps_hv = (
            2.0 * np.percentile(h_gaps, percentile),
            2.0 * np.percentile(v_gaps, percentile)
        )
        return gaps_hv
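Doubling the 75th percentile gives the search headroom over typical row spacing without bridging outliers; a numeric illustration with invented gap values:

# Illustration only, not part of the diff.
import numpy as np

v_gaps = [10, 11, 12, 40]  # one outlier gap
threshold = 2.0 * np.percentile(v_gaps, 75)
# np.percentile(v_gaps, 75) == 19.0, so threshold == 38.0: wide enough
# for the regular 10-12 unit rows, too narrow for the 40-unit break.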
|
|
||||||
|
    def search_table_body(self, gaps_hv, parse_details=None):
        """Build a candidate bbox for the body of a table using network algo

        Seed the process with the textline with the highest alignment
        score, then expand the bbox with textlines within threshold.

        Parameters
        ----------
        gaps_hv : tuple
            The maximum distance allowed to consider surrounding lines/columns
            as part of the same table.
        parse_details : array (optional)
            Optional parameter array, in which to store extra information
            to help later visualization of the table creation.

        """
        # First, determine the textline that has the most combined
        # alignments across the horizontal and vertical axes.
        # It will serve both as a starting point for the table boundary
        # search, and as a way to estimate the average spacing between
        # rows/cols.
        most_aligned_tl = self.most_connected_textline()

        # Calculate the 75th percentile of the horizontal/vertical
        # gaps between textlines. Use this as a reference for a threshold
        # to not exceed while looking for table boundaries.
        max_h_gap, max_v_gap = gaps_hv[0], gaps_hv[1]

        if parse_details is not None:
            # Store debug info
            parse_details_search = {
                "max_h_gap": max_h_gap,
                "max_v_gap": max_v_gap,
                "iterations": []
            }
            parse_details.append(parse_details_search)
        else:
            parse_details_search = None

        bbox = [most_aligned_tl.x0, most_aligned_tl.y0,
                most_aligned_tl.x1, most_aligned_tl.y1]

        # For the body of the table, we only consider cells that have
        # alignments on both axes.
        tls_search_space = list(self._textline_to_alignments.keys())
        tls_search_space.remove(most_aligned_tl)
        tls_in_bbox = [most_aligned_tl]
        last_bbox = None
        last_cols_bounds = [(most_aligned_tl.x0, most_aligned_tl.x1)]
        while last_bbox != bbox:
            if parse_details_search is not None:
                # Store debug info
                parse_details_search["iterations"].append(bbox)

            # Check that the closest tls are within the gaps allowed
            last_bbox = bbox
            cand_bbox = last_bbox.copy()
            closest_tls = find_closest_tls(bbox, tls_search_space)
            for direction, textline in closest_tls.items():
                if textline is None:
                    continue
                expanded_cand_bbox = cand_bbox.copy()

                if direction == "left":
                    if expanded_cand_bbox[0] - textline.x1 > gaps_hv[0]:
                        continue
                    expanded_cand_bbox[0] = textline.x0
                elif direction == "right":
                    if textline.x0 - expanded_cand_bbox[2] > gaps_hv[0]:
                        continue
                    expanded_cand_bbox[2] = textline.x1
                elif direction == "bottom":
                    if expanded_cand_bbox[1] - textline.y1 > gaps_hv[1]:
                        continue
                    expanded_cand_bbox[1] = textline.y0
                elif direction == "top":
                    if textline.y0 - expanded_cand_bbox[3] > gaps_hv[1]:
                        continue
                    expanded_cand_bbox[3] = textline.y1

                # If they are, see what an expanded bbox in that direction
                # would contain
                new_tls = text_in_bbox(expanded_cand_bbox, tls_search_space)
                tls_in_new_box = new_tls + tls_in_bbox

                # And if we're expanding up or down, check that the addition
                # of the new row won't reduce the number of columns.
                # This happens when text covers multiple rows - that's only
                # allowed in the header, treated separately.
                cols_bounds = find_columns_boundaries(tls_in_new_box)
                if direction in ["bottom", "top"] and \
                        len(cols_bounds) < len(last_cols_bounds):
                    continue

                # We have an expansion candidate: register it, update the
                # search space and repeat.
                # We use bbox_from_textlines instead of cand_bbox in case some
                # overlapping textlines require a large bbox for strict fit.
                bbox = cand_bbox = list(bbox_from_textlines(tls_in_new_box))
                last_cols_bounds = cols_bounds
                tls_in_bbox.extend(new_tls)
                for i in range(len(tls_search_space) - 1, -1, -1):
                    textline = tls_search_space[i]
                    if textline in new_tls:
                        del tls_search_space[i]

        if len(tls_in_bbox) >= MINIMUM_TEXTLINES_IN_TABLE:
            return bbox
        return None

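To make the per-direction expansion rule concrete, here is a minimal standalone restatement of the test used above (toy coordinates, hypothetical helper name): a neighboring textline is only merged when the empty space between it and the current bbox stays within the computed gap budget.

def within_gap(bbox, textline_bbox, direction, max_h_gap, max_v_gap):
    """Toy restatement of the per-direction check in search_table_body.

    Both boxes are (x0, y0, x1, y1) in PDF coordinates.
    """
    x0, y0, x1, y1 = textline_bbox
    if direction == "left":
        return bbox[0] - x1 <= max_h_gap
    if direction == "right":
        return x0 - bbox[2] <= max_h_gap
    if direction == "bottom":
        return bbox[1] - y1 <= max_v_gap
    return y0 - bbox[3] <= max_v_gap  # "top"

# A textline 12 points to the right of the box is merged when the
# horizontal budget is 35, but not when it is 10:
print(within_gap([100, 500, 200, 520], (212, 500, 260, 520), "right", 35, 20))  # True
print(within_gap([100, 500, 200, 520], (212, 500, 260, 520), "right", 10, 20))  # False
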
    def generate(self, textlines):
        """Generate the text edge dictionaries based on the
        input textlines.
        """
        self._register_all_text_lines(textlines)
        self._compute_alignment_counts()


class Network(TextBaseParser):
    """Network method of parsing looks for spaces between text
    to parse the table.

    If you want to specify columns when specifying multiple table
    areas, make sure that both lists have the same length.

    Parameters
    ----------
    table_regions : list, optional (default: None)
        List of page regions that may contain tables of the form x1,y1,x2,y2
        where (x1, y1) -> left-top and (x2, y2) -> right-bottom
        in PDF coordinate space.
    table_areas : list, optional (default: None)
        List of table area strings of the form x1,y1,x2,y2
        where (x1, y1) -> left-top and (x2, y2) -> right-bottom
        in PDF coordinate space.
    columns : list, optional (default: None)
        List of column x-coordinates strings where the coordinates
        are comma-separated.
    split_text : bool, optional (default: False)
        Split text that spans across multiple cells.
    flag_size : bool, optional (default: False)
        Flag text based on font size. Useful to detect
        super/subscripts. Adds <s></s> around flagged text.
    strip_text : str, optional (default: '')
        Characters that should be stripped from a string before
        assigning it to a cell.
    edge_tol : int, optional (default: None)
        Tolerance parameter for extending textedges vertically.
        When None, a plausible value is computed from the detected
        gaps between textlines.
    row_tol : int, optional (default: 2)
        Tolerance parameter used to combine text vertically,
        to generate rows.
    column_tol : int, optional (default: 0)
        Tolerance parameter used to combine text horizontally,
        to generate columns.

    """

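For orientation, this is how the new flavor would be exercised end to end — a sketch, assuming read_pdf dispatches on the flavor string registered in __init__ below the same way it does for "stream" and "lattice"; example.pdf is a placeholder file name:

import camelot

tables = camelot.read_pdf("example.pdf", flavor="network")  # placeholder file
print(tables[0].df)
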
    def __init__(
        self,
        table_regions=None,
        table_areas=None,
        columns=None,
        flag_size=False,
        split_text=False,
        strip_text="",
        edge_tol=None,
        row_tol=2,
        column_tol=0,
        debug=False,
        **kwargs):
        super().__init__(
            "network",
            table_regions=table_regions,
            table_areas=table_areas,
            columns=columns,
            flag_size=flag_size,
            split_text=split_text,
            strip_text=strip_text,
            edge_tol=edge_tol,
            row_tol=row_tol,
            column_tol=column_tol,
            debug=debug,
        )

    def _generate_table_bbox(self):
        user_provided_bboxes = None
        if self.table_areas is not None:
            # User gave us table areas already. We will use their coordinates
            # to find column anchors.
            user_provided_bboxes = []
            for area_str in self.table_areas:
                user_provided_bboxes.append(bbox_from_str(area_str))

        # Take all the textlines that are not just spaces
        all_textlines = [
            t for t in self.horizontal_text + self.vertical_text
            if len(t.get_text().strip()) > 0
        ]
        textlines = self._apply_regions_filter(all_textlines)

        textlines_processed = {}
        self.table_bbox_parses = {}
        if self.parse_details is not None:
            parse_details_network_searches = []
            self.parse_details["network_searches"] = \
                parse_details_network_searches
            parse_details_bbox_searches = []
            self.parse_details["bbox_searches"] = parse_details_bbox_searches
            self.parse_details["col_searches"] = []
        else:
            parse_details_network_searches = None
            parse_details_bbox_searches = None

        while True:
            # Find a bbox: either pulling from the user's or from the network
            # algorithm.

            # First look for the body of the table
            bbox_body = None
            if user_provided_bboxes is not None:
                if len(user_provided_bboxes) > 0:
                    bbox_body = user_provided_bboxes.pop()
            else:
                text_network = TextNetworks()
                text_network.generate(textlines)
                text_network.remove_unconnected_edges()
                gaps_hv = text_network.compute_plausible_gaps()
                if gaps_hv is None:
                    return None
                # edge_tol instructions override the calculated vertical gap
                edge_tol_hv = (
                    gaps_hv[0],
                    gaps_hv[1] if self.edge_tol is None else self.edge_tol
                )
                bbox_body = text_network.search_table_body(
                    edge_tol_hv,
                    parse_details=parse_details_bbox_searches
                )

                if parse_details_network_searches is not None:
                    # Preserve the current edge calculation for debugging
                    parse_details_network_searches.append(
                        copy.deepcopy(text_network)
                    )

            if bbox_body is None:
                break

            # Get all the textlines that overlap with the box, compute
            # columns
            tls_in_bbox = textlines_overlapping_bbox(bbox_body, textlines)
            cols_boundaries = find_columns_boundaries(tls_in_bbox)
            cols_anchors = boundaries_to_split_lines(cols_boundaries)

            # Unless the user gave us a strict bbox_body, try to find a header
            # above the body to build the full bbox.
            if user_provided_bboxes is not None:
                bbox_full = bbox_body
            else:
                # Expand the text box to fully contain the tls we found
                bbox_body = bbox_from_textlines(tls_in_bbox)

                # Apply a heuristic to salvage headers whose formatting might
                # be off compared to the rest of the table.
                bbox_full = search_header_from_body_bbox(
                    bbox_body,
                    textlines,
                    cols_anchors,
                    gaps_hv[1]
                )

            table_parse = {
                "bbox_body": bbox_body,
                "cols_boundaries": cols_boundaries,
                "cols_anchors": cols_anchors,
                "bbox_full": bbox_full
            }
            self.table_bbox_parses[bbox_full] = table_parse

            if self.parse_details is not None:
                self.parse_details["col_searches"].append(table_parse)

            # Remember what textlines we processed, and repeat
            for textline in tls_in_bbox:
                textlines_processed[textline] = None
            textlines = list(filter(
                lambda textline: textline not in textlines_processed,
                textlines
            ))

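For reference, one entry of self.table_bbox_parses produced above has the following shape; the coordinates are invented for illustration:

# {
#     (60.0, 120.5, 540.0, 700.0): {            # keyed by bbox_full
#         "bbox_body": (60.0, 120.5, 540.0, 660.0),
#         "cols_boundaries": [(60.0, 180.0), (210.0, 330.0), (350.0, 540.0)],
#         "cols_anchors": [60.0, 195.0, 340.0, 540.0],
#         "bbox_full": (60.0, 120.5, 540.0, 700.0),
#     }
# }
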
    def _generate_columns_and_rows(self, bbox, user_cols):
        # select elements which lie within table_bbox
        self.t_bbox = text_in_bbox_per_axis(
            bbox,
            self.horizontal_text,
            self.vertical_text
        )

        all_tls = list(
            sorted(
                filter(
                    lambda textline: len(textline.get_text().strip()) > 0,
                    self.t_bbox["horizontal"] + self.t_bbox["vertical"]
                ),
                key=lambda textline: (-textline.y0, textline.x0)
            )
        )
        text_x_min, text_y_min, text_x_max, text_y_max = bbox_from_textlines(
            all_tls
        )
        # FRHTODO:
        # This algorithm takes the horizontal textlines in the bbox, and groups
        # them into rows based on their bottom y0.
        # That's wrong: it misses the vertical items, and misses out on all
        # the alignment identification work we've done earlier.
        rows_grouped = self._group_rows(all_tls, row_tol=self.row_tol)
        rows = self._join_rows(rows_grouped, text_y_max, text_y_min)

        if user_cols is not None:
            cols = [text_x_min] + user_cols + [text_x_max]
            cols = [
                (cols[i], cols[i + 1])
                for i in range(0, len(cols) - 1)
            ]
        else:
            parse_details = self.table_bbox_parses[bbox]
            col_anchors = parse_details["cols_anchors"]
            cols = list(map(
                lambda idx: [col_anchors[idx], col_anchors[idx + 1]],
                range(0, len(col_anchors) - 1)
            ))

        return cols, rows, None, None

@@ -1,21 +1,18 @@
 # -*- coding: utf-8 -*-
 
-import os
-import logging
 import warnings
 
-import numpy as np
-import pandas as pd
-
-from .base import BaseParser
-from ..core import TextEdges, Table
-from ..utils import text_in_bbox, get_table_index, compute_accuracy, compute_whitespace
-
-
-logger = logging.getLogger("camelot")
-
-
-class Stream(BaseParser):
+from .base import TextBaseParser
+from ..core import TextEdges
+from ..utils import (
+    bbox_from_str,
+    bbox_from_textlines,
+    text_in_bbox,
+    text_in_bbox_per_axis,
+)
+
+
+class Stream(TextBaseParser):
     """Stream method of parsing looks for spaces between text
     to parse the table.
 
@@ -55,218 +52,35 @@ class Stream(BaseParser):
     """
 
     def __init__(
         self,
         table_regions=None,
         table_areas=None,
         columns=None,
-        split_text=False,
         flag_size=False,
+        split_text=False,
         strip_text="",
         edge_tol=50,
         row_tol=2,
         column_tol=0,
-        **kwargs
-    ):
-        self.table_regions = table_regions
-        self.table_areas = table_areas
-        self.columns = columns
-        self._validate_columns()
-        self.split_text = split_text
-        self.flag_size = flag_size
-        self.strip_text = strip_text
-        self.edge_tol = edge_tol
-        self.row_tol = row_tol
-        self.column_tol = column_tol
-
-    @staticmethod
-    def _text_bbox(t_bbox):
-        """Returns bounding box for the text present on a page.
-
-        Parameters
-        ----------
-        t_bbox : dict
-            Dict with two keys 'horizontal' and 'vertical' with lists of
-            LTTextLineHorizontals and LTTextLineVerticals respectively.
-
-        Returns
-        -------
-        text_bbox : tuple
-            Tuple (x0, y0, x1, y1) in pdf coordinate space.
-
-        """
-        xmin = min([t.x0 for direction in t_bbox for t in t_bbox[direction]])
-        ymin = min([t.y0 for direction in t_bbox for t in t_bbox[direction]])
-        xmax = max([t.x1 for direction in t_bbox for t in t_bbox[direction]])
-        ymax = max([t.y1 for direction in t_bbox for t in t_bbox[direction]])
-        text_bbox = (xmin, ymin, xmax, ymax)
-        return text_bbox
-
-    @staticmethod
-    def _group_rows(text, row_tol=2):
-        """Groups PDFMiner text objects into rows vertically
-        within a tolerance.
-
-        Parameters
-        ----------
-        text : list
-            List of PDFMiner text objects.
-        row_tol : int, optional (default: 2)
-
-        Returns
-        -------
-        rows : list
-            Two-dimensional list of text objects grouped into rows.
-
-        """
-        row_y = 0
-        rows = []
-        temp = []
-        for t in text:
-            # is checking for upright necessary?
-            # if t.get_text().strip() and all([obj.upright for obj in t._objs if
-            # type(obj) is LTChar]):
-            if t.get_text().strip():
-                if not np.isclose(row_y, t.y0, atol=row_tol):
-                    rows.append(sorted(temp, key=lambda t: t.x0))
-                    temp = []
-                    row_y = t.y0
-                temp.append(t)
-        rows.append(sorted(temp, key=lambda t: t.x0))
-        __ = rows.pop(0)  # TODO: hacky
-        return rows
-
-    @staticmethod
-    def _merge_columns(l, column_tol=0):
-        """Merges column boundaries horizontally if they overlap
-        or lie within a tolerance.
-
-        Parameters
-        ----------
-        l : list
-            List of column x-coordinate tuples.
-        column_tol : int, optional (default: 0)
-
-        Returns
-        -------
-        merged : list
-            List of merged column x-coordinate tuples.
-
-        """
-        merged = []
-        for higher in l:
-            if not merged:
-                merged.append(higher)
-            else:
-                lower = merged[-1]
-                if column_tol >= 0:
-                    if higher[0] <= lower[1] or np.isclose(
-                        higher[0], lower[1], atol=column_tol
-                    ):
-                        upper_bound = max(lower[1], higher[1])
-                        lower_bound = min(lower[0], higher[0])
-                        merged[-1] = (lower_bound, upper_bound)
-                    else:
-                        merged.append(higher)
-                elif column_tol < 0:
-                    if higher[0] <= lower[1]:
-                        if np.isclose(higher[0], lower[1], atol=abs(column_tol)):
-                            merged.append(higher)
-                        else:
-                            upper_bound = max(lower[1], higher[1])
-                            lower_bound = min(lower[0], higher[0])
-                            merged[-1] = (lower_bound, upper_bound)
-                    else:
-                        merged.append(higher)
-        return merged
-
-    @staticmethod
-    def _join_rows(rows_grouped, text_y_max, text_y_min):
-        """Makes row coordinates continuous.
-
-        Parameters
-        ----------
-        rows_grouped : list
-            Two-dimensional list of text objects grouped into rows.
-        text_y_max : int
-        text_y_min : int
-
-        Returns
-        -------
-        rows : list
-            List of continuous row y-coordinate tuples.
-
-        """
-        row_mids = [
-            sum([(t.y0 + t.y1) / 2 for t in r]) / len(r) if len(r) > 0 else 0
-            for r in rows_grouped
-        ]
-        rows = [(row_mids[i] + row_mids[i - 1]) / 2 for i in range(1, len(row_mids))]
-        rows.insert(0, text_y_max)
-        rows.append(text_y_min)
-        rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)]
-        return rows
-
-    @staticmethod
-    def _add_columns(cols, text, row_tol):
-        """Adds columns to existing list by taking into account
-        the text that lies outside the current column x-coordinates.
-
-        Parameters
-        ----------
-        cols : list
-            List of column x-coordinate tuples.
-        text : list
-            List of PDFMiner text objects.
-        ytol : int
-
-        Returns
-        -------
-        cols : list
-            Updated list of column x-coordinate tuples.
-
-        """
-        if text:
-            text = Stream._group_rows(text, row_tol=row_tol)
-            elements = [len(r) for r in text]
-            new_cols = [
-                (t.x0, t.x1) for r in text if len(r) == max(elements) for t in r
-            ]
-            cols.extend(Stream._merge_columns(sorted(new_cols)))
-        return cols
-
-    @staticmethod
-    def _join_columns(cols, text_x_min, text_x_max):
-        """Makes column coordinates continuous.
-
-        Parameters
-        ----------
-        cols : list
-            List of column x-coordinate tuples.
-        text_x_min : int
-        text_y_max : int
-
-        Returns
-        -------
-        cols : list
-            Updated list of column x-coordinate tuples.
-
-        """
-        cols = sorted(cols)
-        cols = [(cols[i][0] + cols[i - 1][1]) / 2 for i in range(1, len(cols))]
-        cols.insert(0, text_x_min)
-        cols.append(text_x_max)
-        cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
-        return cols
-
-    def _validate_columns(self):
-        if self.table_areas is not None and self.columns is not None:
-            if len(self.table_areas) != len(self.columns):
-                raise ValueError("Length of table_areas and columns" " should be equal")
+        **kwargs):
+        super().__init__(
+            "stream",
+            table_regions=table_regions,
+            table_areas=table_areas,
+            columns=columns,
+            flag_size=flag_size,
+            split_text=split_text,
+            strip_text=strip_text,
+            edge_tol=edge_tol,
+            row_tol=row_tol,
+            column_tol=column_tol,
+        )
+        self.textedges = []
 
     def _nurminen_table_detection(self, textlines):
         """A general implementation of the table detection algorithm
         described by Anssi Nurminen's master's thesis.
-        Link: https://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3
+        Link: https://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3  # noqa
 
         Assumes that tables are situated relatively far apart
         vertically.
@@ -283,65 +97,59 @@ class Stream(BaseParser):
         # guess table areas using textlines and relevant edges
         table_bbox = textedges.get_table_areas(textlines, relevant_textedges)
         # treat whole page as table area if no table areas found
-        if not len(table_bbox):
+        if not table_bbox:
             table_bbox = {(0, 0, self.pdf_width, self.pdf_height): None}
 
         return table_bbox
 
+    def record_parse_metadata(self, table):
+        """Record data about the origin of the table
+        """
+        super().record_parse_metadata(table)
+        table._textedges = self.textedges
+
     def _generate_table_bbox(self):
-        self.textedges = []
         if self.table_areas is None:
             hor_text = self.horizontal_text
             if self.table_regions is not None:
                 # filter horizontal text
                 hor_text = []
-                for region in self.table_regions:
-                    x1, y1, x2, y2 = region.split(",")
-                    x1 = float(x1)
-                    y1 = float(y1)
-                    x2 = float(x2)
-                    y2 = float(y2)
-                    region_text = text_in_bbox((x1, y2, x2, y1), self.horizontal_text)
+                for region_str in self.table_regions:
+                    region_text = text_in_bbox(
+                        bbox_from_str(region_str),
+                        self.horizontal_text)
                     hor_text.extend(region_text)
             # find tables based on nurminen's detection algorithm
-            table_bbox = self._nurminen_table_detection(hor_text)
+            table_bbox_parses = self._nurminen_table_detection(hor_text)
         else:
-            table_bbox = {}
-            for area in self.table_areas:
-                x1, y1, x2, y2 = area.split(",")
-                x1 = float(x1)
-                y1 = float(y1)
-                x2 = float(x2)
-                y2 = float(y2)
-                table_bbox[(x1, y2, x2, y1)] = None
-        self.table_bbox = table_bbox
+            table_bbox_parses = {}
+            for area_str in self.table_areas:
+                table_bbox_parses[bbox_from_str(area_str)] = None
+        self.table_bbox_parses = table_bbox_parses
 
-    def _generate_columns_and_rows(self, table_idx, tk):
+    def _generate_columns_and_rows(self, bbox, user_cols):
         # select elements which lie within table_bbox
-        t_bbox = {}
-        t_bbox["horizontal"] = text_in_bbox(tk, self.horizontal_text)
-        t_bbox["vertical"] = text_in_bbox(tk, self.vertical_text)
-
-        t_bbox["horizontal"].sort(key=lambda x: (-x.y0, x.x0))
-        t_bbox["vertical"].sort(key=lambda x: (x.x0, -x.y0))
-
-        self.t_bbox = t_bbox
-
-        text_x_min, text_y_min, text_x_max, text_y_max = self._text_bbox(self.t_bbox)
-        rows_grouped = self._group_rows(self.t_bbox["horizontal"], row_tol=self.row_tol)
+        self.t_bbox = text_in_bbox_per_axis(
+            bbox,
+            self.horizontal_text,
+            self.vertical_text
+        )
+
+        text_x_min, text_y_min, text_x_max, text_y_max = bbox_from_textlines(
+            self.t_bbox["horizontal"] + self.t_bbox["vertical"]
+        )
+
+        rows_grouped = self._group_rows(
+            self.t_bbox["horizontal"], row_tol=self.row_tol)
         rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
         elements = [len(r) for r in rows_grouped]
 
-        if self.columns is not None and self.columns[table_idx] != "":
-            # user has to input boundary columns too
-            # take (0, pdf_width) by default
-            # similar to else condition
-            # len can't be 1
-            cols = self.columns[table_idx].split(",")
-            cols = [float(c) for c in cols]
-            cols.insert(0, text_x_min)
-            cols.append(text_x_max)
-            cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
+        if user_cols is not None:
+            cols = [text_x_min] + user_cols + [text_x_max]
+            cols = [
+                (cols[i], cols[i + 1])
+                for i in range(0, len(cols) - 1)
+            ]
         else:
             # calculate mode of the list of number of elements in
             # each row to guess the number of columns
@@ -353,14 +161,22 @@ class Stream(BaseParser):
             # see if the list contains elements, if yes, then use
             # the mode after removing 1s
             elements = list(filter(lambda x: x != 1, elements))
-            if len(elements):
+            if elements:
                 ncols = max(set(elements), key=elements.count)
             else:
                 warnings.warn(
-                    f"No tables found in table area {table_idx + 1}"
+                    f"No tables found in table area {bbox}"
                 )
-            cols = [(t.x0, t.x1) for r in rows_grouped if len(r) == ncols for t in r]
-            cols = self._merge_columns(sorted(cols), column_tol=self.column_tol)
+            cols = [
+                (t.x0, t.x1)
+                for r in rows_grouped
+                if len(r) == ncols
+                for t in r
+            ]
+            cols = self._merge_columns(
+                sorted(cols),
+                column_tol=self.column_tol
+            )
             inner_text = []
             for i in range(1, len(cols)):
                 left = cols[i - 1][1]
@@ -383,80 +199,4 @@ class Stream(BaseParser):
             cols = self._add_columns(cols, inner_text, self.row_tol)
             cols = self._join_columns(cols, text_x_min, text_x_max)
 
-        return cols, rows
-
-    def _generate_table(self, table_idx, cols, rows, **kwargs):
-        table = Table(cols, rows)
-        table = table.set_all_edges()
-
-        pos_errors = []
-        # TODO: have a single list in place of two directional ones?
-        # sorted on x-coordinate based on reading order i.e. LTR or RTL
-        for direction in ["vertical", "horizontal"]:
-            for t in self.t_bbox[direction]:
-                indices, error = get_table_index(
-                    table,
-                    t,
-                    direction,
-                    split_text=self.split_text,
-                    flag_size=self.flag_size,
-                    strip_text=self.strip_text,
-                )
-                if indices[:2] != (-1, -1):
-                    pos_errors.append(error)
-                    for r_idx, c_idx, text in indices:
-                        table.cells[r_idx][c_idx].text = text
-        accuracy = compute_accuracy([[100, pos_errors]])
-
-        data = table.data
-        table.df = pd.DataFrame(data)
-        table.shape = table.df.shape
-
-        whitespace = compute_whitespace(data)
-        table.flavor = "stream"
-        table.accuracy = accuracy
-        table.whitespace = whitespace
-        table.order = table_idx + 1
-        table.page = int(os.path.basename(self.rootname).replace("page-", ""))
-
-        # for plotting
-        _text = []
-        _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
-        _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
-        table._text = _text
-        table._image = None
-        table._segments = None
-        table._textedges = self.textedges
-
-        return table
-
-    def extract_tables(self, filename, suppress_stdout=False, layout_kwargs={}):
-        self._generate_layout(filename, layout_kwargs)
-        base_filename = os.path.basename(self.rootname)
-
-        if not suppress_stdout:
-            logger.info(f"Processing {base_filename}")
-
-        if not self.horizontal_text:
-            if self.images:
-                warnings.warn(
-                    f"{base_filename} is image-based, camelot only works on"
-                    " text-based pages."
-                )
-            else:
-                warnings.warn(f"No tables found on {base_filename}")
-            return []
-
-        self._generate_table_bbox()
-
-        _tables = []
-        # sort tables based on y-coord
-        for table_idx, tk in enumerate(
-            sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True)
-        ):
-            cols, rows = self._generate_columns_and_rows(table_idx, tk)
-            table = self._generate_table(table_idx, cols, rows)
-            table._bbox = tk
-            _tables.append(table)
-
-        return _tables
+        return cols, rows, None, None
@@ -8,9 +8,164 @@ except ImportError:
 else:
     _HAS_MPL = True
 
+from .utils import (bbox_from_str, bbox_from_textlines, get_textline_coords)
 
-class PlotMethods(object):
-    def __call__(self, table, kind="text", filename=None):
+from pdfminer.layout import (
+    LTTextLineVertical,
+)
+
+
+def extend_axe_lim(ax, bbox, margin=10):
+    """Ensure the ax limits include the input bbox
+    """
+    x0, x1 = ax.get_xlim()
+    y0, y1 = ax.get_ylim()
+    ax.set_xlim(min(x0, bbox[0] - margin), max(x1, bbox[2] + margin))
+    ax.set_ylim(min(y0, bbox[1] - margin), max(y1, bbox[3] + margin))
+
+
+def draw_labeled_bbox(
+    ax, bbox, text,
+    color="black", linewidth=3,
+    linestyle="solid",
+    label_pos="top,left",
+    fontsize=12,
+):
+    """Utility drawing function to draw a box with an associated text label
+    """
+    ax.add_patch(
+        patches.Rectangle(
+            (bbox[0], bbox[1]),
+            bbox[2] - bbox[0], bbox[3] - bbox[1],
+            color=color,
+            linewidth=linewidth, linestyle=linestyle,
+            fill=False
+        )
+    )
+
+    vlabel, hlabel = label_pos.split(",")
+    if vlabel == "top":
+        y = max(bbox[1], bbox[3])
+    elif vlabel == "bottom":
+        y = min(bbox[1], bbox[3])
+    else:
+        y = 0.5 * (bbox[1] + bbox[3])
+
+    # We want to draw the label outside the box (above or below)
+    label_align_swap = {
+        "top": "bottom",
+        "bottom": "top",
+        "center": "center"
+    }
+    vlabel_out_of_box = label_align_swap[vlabel]
+    if hlabel == "right":
+        x = max(bbox[0], bbox[2])
+    elif hlabel == "left":
+        x = min(bbox[0], bbox[2])
+    else:
+        x = 0.5 * (bbox[0] + bbox[2])
+    ax.text(
+        x, y,
+        text,
+        fontsize=fontsize, color="black",
+        verticalalignment=vlabel_out_of_box,
+        horizontalalignment=hlabel,
+        bbox=dict(facecolor=color, alpha=0.1)
+    )
+
+
+def draw_pdf(table, ax):
+    """Draw the content of the table's source pdf into the passed subplot
+
+    Parameters
+    ----------
+    table : camelot.core.Table
+    ax : matplotlib.axes.Axes (optional)
+
+    """
+    img = table.get_pdf_image()
+    ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
+
+
+def draw_parse_constraints(table, ax):
+    """Draw any user provided constraints (area, region, columns, etc)
+
+    Parameters
+    ----------
+    table : camelot.core.Table
+    ax : matplotlib.axes.Axes (optional)
+
+    """
+    if table.parse_details:
+        zone_constraints = {
+            "region": "table_regions",
+            "area": "table_areas",
+        }
+        for zone_name, zone_id in zone_constraints.items():
+            # Display a bbox per region / area
+            for zone_str in table.parse_details[zone_id] or []:
+                draw_labeled_bbox(
+                    ax, bbox_from_str(zone_str),
+                    "{zone_name}: ({zone_str})".format(
+                        zone_name=zone_name,
+                        zone_str=zone_str
+                    ),
+                    color="purple",
+                    linestyle="dotted",
+                    linewidth=1,
+                    label_pos="bottom,right"
+                )
+
+
+def draw_text(table, ax):
+    """Draw text, horizontal in blue, vertical in red
+
+    Parameters
+    ----------
+    table : camelot.core.Table
+    ax : matplotlib.axes.Axes (optional)
+
+    """
+    bbox = bbox_from_textlines(table.textlines)
+    for t in table.textlines:
+        color = "red" if isinstance(t, LTTextLineVertical) else "blue"
+        ax.add_patch(
+            patches.Rectangle(
+                (t.x0, t.y0),
+                t.x1 - t.x0,
+                t.y1 - t.y0,
+                color=color,
+                alpha=0.2
+            )
+        )
+    extend_axe_lim(ax, bbox)
+
+
+def prepare_plot(table, ax=None):
+    """Initialize plot and draw common components
+
+    Parameters
+    ----------
+    table : camelot.core.Table
+    ax : matplotlib.axes.Axes (optional)
+
+    Returns
+    -------
+    ax : matplotlib.axes.Axes
+    """
+    if ax is None:
+        fig = plt.figure()
+        ax = fig.add_subplot(111, aspect="equal")
+    draw_pdf(table, ax)
+    draw_parse_constraints(table, ax)
+    return ax
+
+
+class PlotMethods():
+    def __call__(self, table, kind="text", filename=None, ax=None):
         """Plot elements found on PDF page based on kind
         specified, useful for debugging and playing with different
         parameters to get the best output.
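The ax parameter threaded through prepare_plot and the draw_* helpers lets callers compose several diagnostic views on one figure. A sketch, assuming camelot.plot forwards to PlotMethods.__call__ as it does in the current package, with example.pdf as a placeholder file:

import camelot
import matplotlib.pyplot as plt

tables = camelot.read_pdf("example.pdf", flavor="network")  # placeholder
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))
camelot.plot(tables[0], kind="text", ax=ax1)
camelot.plot(tables[0], kind="network_table_search", ax=ax2)
plt.show()
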
@@ -20,7 +175,8 @@ class PlotMethods(object):
         table: camelot.core.Table
             A Camelot Table.
         kind : str, optional (default: 'text')
-            {'text', 'grid', 'contour', 'joint', 'line'}
+            {'text', 'grid', 'contour', 'joint', 'line',
+             'network_table_search'}
             The element type for which a plot should be generated.
         filepath: str, optional (default: None)
             Absolute path for saving the generated plot.
@@ -37,53 +193,49 @@ class PlotMethods(object):
             raise NotImplementedError(
                 f"Lattice flavor does not support kind='{kind}'"
             )
-        elif table.flavor == "stream" and kind in ["joint", "line"]:
+        if table.flavor != "lattice" and kind in ["line"]:
             raise NotImplementedError(
-                f"Stream flavor does not support kind='{kind}'"
+                f"{table.flavor} flavor does not support kind='{kind}'"
             )
 
         plot_method = getattr(self, kind)
-        return plot_method(table)
+        return plot_method(table, ax)
 
-    def text(self, table):
+    @staticmethod
+    def text(table, ax=None):
         """Generates a plot for all text elements present
         on the PDF page.
 
         Parameters
         ----------
         table : camelot.core.Table
+        ax : matplotlib.axes.Axes (optional)
 
         Returns
         -------
         fig : matplotlib.fig.Figure
 
         """
-        fig = plt.figure()
-        ax = fig.add_subplot(111, aspect="equal")
-        xs, ys = [], []
-        for t in table._text:
-            xs.extend([t[0], t[2]])
-            ys.extend([t[1], t[3]])
-            ax.add_patch(patches.Rectangle((t[0], t[1]), t[2] - t[0], t[3] - t[1]))
-        ax.set_xlim(min(xs) - 10, max(xs) + 10)
-        ax.set_ylim(min(ys) - 10, max(ys) + 10)
-        return fig
+        ax = prepare_plot(table, ax)
+        draw_text(table, ax)
+        return ax.get_figure()
 
-    def grid(self, table):
+    @staticmethod
+    def grid(table, ax=None):
         """Generates a plot for the detected table grids
         on the PDF page.
 
         Parameters
         ----------
         table : camelot.core.Table
+        ax : matplotlib.axes.Axes (optional)
 
         Returns
         -------
         fig : matplotlib.fig.Figure
 
         """
-        fig = plt.figure()
-        ax = fig.add_subplot(111, aspect="equal")
+        ax = prepare_plot(table, ax)
         for row in table.cells:
             for cell in row:
                 if cell.left:
@@ -94,130 +246,247 @@ class PlotMethods(object):
                     ax.plot([cell.lt[0], cell.rt[0]], [cell.lt[1], cell.rt[1]])
                 if cell.bottom:
                     ax.plot([cell.lb[0], cell.rb[0]], [cell.lb[1], cell.rb[1]])
-        return fig
+        return ax.get_figure()
 
-    def contour(self, table):
+    @staticmethod
+    def contour(table, ax=None):
         """Generates a plot for all table boundaries present
         on the PDF page.
 
         Parameters
         ----------
         table : camelot.core.Table
+        ax : matplotlib.axes.Axes (optional)
 
         Returns
         -------
         fig : matplotlib.fig.Figure
 
         """
-        try:
-            img, table_bbox = table._image
-            _FOR_LATTICE = True
-        except TypeError:
-            img, table_bbox = (None, {table._bbox: None})
-            _FOR_LATTICE = False
-        fig = plt.figure()
-        ax = fig.add_subplot(111, aspect="equal")
-
-        xs, ys = [], []
+        _FOR_LATTICE = table.flavor == "lattice"
+        ax = prepare_plot(table, ax)
+
         if not _FOR_LATTICE:
-            for t in table._text:
-                xs.extend([t[0], t[2]])
-                ys.extend([t[1], t[3]])
-                ax.add_patch(
-                    patches.Rectangle(
-                        (t[0], t[1]), t[2] - t[0], t[3] - t[1], color="blue"
-                    )
-                )
-
-        for t in table_bbox.keys():
-            ax.add_patch(
-                patches.Rectangle(
-                    (t[0], t[1]), t[2] - t[0], t[3] - t[1], fill=False, color="red"
-                )
-            )
-            if not _FOR_LATTICE:
-                xs.extend([t[0], t[2]])
-                ys.extend([t[1], t[3]])
-                ax.set_xlim(min(xs) - 10, max(xs) + 10)
-                ax.set_ylim(min(ys) - 10, max(ys) + 10)
-
-        if _FOR_LATTICE:
-            ax.imshow(img)
-        return fig
+            draw_text(table, ax)
+
+        ax.add_patch(
+            patches.Rectangle(
+                (table._bbox[0], table._bbox[1]),
+                table._bbox[2] - table._bbox[0],
+                table._bbox[3] - table._bbox[1],
+                fill=False, color="red"
+            )
+        )
+        if not _FOR_LATTICE:
+            extend_axe_lim(ax, table._bbox)
+
+        return ax.get_figure()
 
-    def textedge(self, table):
+    @staticmethod
+    def textedge(table, ax=None):
         """Generates a plot for relevant textedges.
 
         Parameters
         ----------
         table : camelot.core.Table
+        ax : matplotlib.axes.Axes (optional)
 
         Returns
         -------
         fig : matplotlib.fig.Figure
 
         """
-        fig = plt.figure()
-        ax = fig.add_subplot(111, aspect="equal")
-        xs, ys = [], []
-        for t in table._text:
-            xs.extend([t[0], t[2]])
-            ys.extend([t[1], t[3]])
-            ax.add_patch(
-                patches.Rectangle((t[0], t[1]), t[2] - t[0], t[3] - t[1], color="blue")
-            )
-        ax.set_xlim(min(xs) - 10, max(xs) + 10)
-        ax.set_ylim(min(ys) - 10, max(ys) + 10)
-
-        for te in table._textedges:
-            ax.plot([te.x, te.x], [te.y0, te.y1])
-
-        return fig
+        ax = prepare_plot(table, ax)
+        draw_text(table, ax)
+
+        if table.flavor == "network":
+            for network in table.parse_details["network_searches"]:
+                most_connected_tl = network.most_connected_textline()
+
+                ax.add_patch(
+                    patches.Rectangle(
+                        (most_connected_tl.x0, most_connected_tl.y0),
+                        most_connected_tl.x1 - most_connected_tl.x0,
+                        most_connected_tl.y1 - most_connected_tl.y0,
+                        color="red",
+                        alpha=0.5
+                    )
+                )
+                for tl in sorted(
+                        network._textline_to_alignments.keys(),
+                        key=lambda textline: (-textline.y0, textline.x0)
+                ):
+                    alignments = network._textline_to_alignments[tl]
+                    coords = get_textline_coords(tl)
+                    alignment_id_h, tls_h = alignments.max_v()
+                    alignment_id_v, tls_v = alignments.max_h()
+                    xs = list(map(lambda tl: tl.x0, tls_v))
+                    ys = list(map(lambda tl: tl.y1, tls_h))
+                    top_h = max(ys)
+                    ax.text(
+                        coords[alignment_id_h],
+                        top_h + 5,
+                        "{max_h_count}".format(max_h_count=len(tls_h)),
+                        verticalalignment="bottom",
+                        horizontalalignment="center",
+                        fontsize=8,
+                        color="green"
+                    )
+                    ax.plot(
+                        [coords[alignment_id_h]] * len(ys), ys,
+                        color="green",
+                        linestyle="solid",
+                        linewidth=1,
+                        marker="o",
+                        markersize=3
+                    )
+                    left_v = min(map(lambda tl: tl.x0, tls_v))
+                    ax.text(
+                        left_v - 5,
+                        coords[alignment_id_v],
+                        "{max_v_count}".format(max_v_count=len(tls_v)),
+                        verticalalignment="center",
+                        horizontalalignment="right",
+                        fontsize=8,
+                        color="blue"
+                    )
+                    ax.plot(
+                        xs, [coords[alignment_id_v]] * len(xs),
+                        color="blue",
+                        linestyle="solid",
+                        linewidth=1,
+                        marker="o",
+                        markersize=3
+                    )
+        else:
+            for te in table._textedges:
+                ax.plot([te.coord, te.coord], [te.y0, te.y1])
+        return ax.get_figure()
 
-    def joint(self, table):
+    @staticmethod
+    def joint(table, ax=None):
         """Generates a plot for all line intersections present
         on the PDF page.
 
         Parameters
         ----------
         table : camelot.core.Table
+        ax : matplotlib.axes.Axes (optional)
 
         Returns
         -------
         fig : matplotlib.fig.Figure
 
         """
-        img, table_bbox = table._image
-        fig = plt.figure()
-        ax = fig.add_subplot(111, aspect="equal")
+        ax = prepare_plot(table, ax)
         x_coord = []
         y_coord = []
-        for k in table_bbox.keys():
-            for coord in table_bbox[k]:
-                x_coord.append(coord[0])
-                y_coord.append(coord[1])
+        for coord in table.parse["joints"]:
+            x_coord.append(coord[0])
+            y_coord.append(coord[1])
         ax.plot(x_coord, y_coord, "ro")
-        ax.imshow(img)
-        return fig
+        return ax.get_figure()
 
-    def line(self, table):
+    @staticmethod
+    def line(table, ax=None):
         """Generates a plot for all line segments present
         on the PDF page.
 
         Parameters
         ----------
         table : camelot.core.Table
+        ax : matplotlib.axes.Axes (optional)
 
         Returns
         -------
         fig : matplotlib.fig.Figure
 
         """
-        fig = plt.figure()
-        ax = fig.add_subplot(111, aspect="equal")
+        ax = prepare_plot(table, ax)
         vertical, horizontal = table._segments
         for v in vertical:
             ax.plot([v[0], v[2]], [v[1], v[3]])
         for h in horizontal:
             ax.plot([h[0], h[2]], [h[1], h[3]])
-        return fig
+        return ax.get_figure()
+
+    @staticmethod
+    def network_table_search(table, ax=None):
+        """Generates a plot illustrating the steps of the network table search.
+
+        Parameters
+        ----------
+        table : camelot.core.Table
+        ax : matplotlib.axes.Axes (optional)
+
+        Returns
+        -------
+        fig : matplotlib.fig.Figure
+
+        """
+        ax = prepare_plot(table, ax)
+        if table.parse_details is None:
+            return ax.get_figure()
+        parse_details = table.parse_details
+        for box_id, bbox_search in enumerate(parse_details["bbox_searches"]):
+            max_h_gap = bbox_search["max_h_gap"]
+            max_v_gap = bbox_search["max_v_gap"]
+            iterations = bbox_search["iterations"]
+            for iteration, bbox in enumerate(iterations):
+                final = iteration == len(iterations) - 1
+
+                draw_labeled_bbox(
+                    ax, bbox,
+                    "t{box_id}/i{iteration}".format(
+                        box_id=box_id,
+                        iteration=iteration
+                    ),
+                    color="red",
+                    linewidth=5 if final else 2,
+                    fontsize=12 if final else 8,
+                    label_pos="bottom,left"
+                )
+
+                ax.add_patch(
+                    patches.Rectangle(
+                        (bbox[0] - max_h_gap, bbox[1] - max_v_gap),
+                        bbox[2] - bbox[0] + 2 * max_h_gap,
+                        bbox[3] - bbox[1] + 2 * max_v_gap,
+                        color="orange",
+                        fill=False
+                    )
+                )
+
+        for box_id, col_search in enumerate(parse_details["col_searches"]):
+            draw_labeled_bbox(
+                ax, col_search["bbox_full"],
+                "box body + header #{box_id}".format(
+                    box_id=box_id
+                ),
+                color="red",
+                linewidth=4,
+                label_pos="top,left"
+            )
+            draw_labeled_bbox(
+                ax, col_search["bbox_body"],
+                "box body #{box_id}".format(
+                    box_id=box_id
+                ),
+                color="orange",
+                linewidth=2,
+                label_pos="bottom,left"
+            )
+            for col_anchor in col_search["cols_anchors"]:
+                # Display a green line at the col boundary line throughout the
+                # table bbox.
+                ax.plot(
+                    [col_anchor, col_anchor],
+                    [
+                        col_search["bbox_body"][1] - 10,
+                        col_search["bbox_body"][3] + 10,
+                    ],
+                    color="green"
+                )
+
+        return ax.get_figure()
617
camelot/utils.py

@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 
 import os
+import atexit
 import re
 import random
 import shutil
@@ -9,8 +10,10 @@ import tempfile
 import warnings
 from itertools import groupby
 from operator import itemgetter
+from urllib.request import Request
 
 import numpy as np
+import pandas as pd
 from pdfminer.pdfparser import PDFParser
 from pdfminer.pdfdocument import PDFDocument
 from pdfminer.pdfpage import PDFPage
@@ -27,7 +30,9 @@ from pdfminer.layout import (
     LTImage,
 )
 
-from urllib.request import Request, urlopen
+from .ext.ghostscript import Ghostscript
 
+from urllib.request import urlopen
 from urllib.parse import urlparse as parse_url
 from urllib.parse import uses_relative, uses_netloc, uses_params
@@ -93,8 +98,21 @@ def download_url(url):
     return filepath
 
 
-stream_kwargs = ["columns", "edge_tol", "row_tol", "column_tol"]
-lattice_kwargs = [
+common_kwargs = [
+    "flag_size",
+    "margins",
+    "split_text",
+    "strip_text",
+    "table_areas",
+    "table_regions"
+]
+text_kwargs = common_kwargs + [
+    "columns",
+    "edge_tol",
+    "row_tol",
+    "column_tol"
+]
+lattice_kwargs = common_kwargs + [
     "process_background",
     "line_scale",
     "copy_text",
@@ -106,42 +124,72 @@ lattice_kwargs = [
     "iterations",
     "resolution",
 ]
+flavor_to_kwargs = {
+    "stream": text_kwargs,
+    "network": text_kwargs,
+    "lattice": lattice_kwargs,
+    "hybrid": text_kwargs + lattice_kwargs,
+}
 
 
 def validate_input(kwargs, flavor="lattice"):
-    def check_intersection(parser_kwargs, input_kwargs):
-        isec = set(parser_kwargs).intersection(set(input_kwargs.keys()))
-        if isec:
-            raise ValueError(
-                f"{','.join(sorted(isec))} cannot be used with flavor='{flavor}'"
-            )
-
-    if flavor == "lattice":
-        check_intersection(stream_kwargs, kwargs)
-    else:
-        check_intersection(lattice_kwargs, kwargs)
+    parser_kwargs = flavor_to_kwargs[flavor]
+    # s.difference(t): new set with elements in s but not in t
+    isec = set(kwargs.keys()).difference(set(parser_kwargs))
+    if isec:
+        raise ValueError(
+            f"{','.join(sorted(isec))} cannot be used with flavor='{flavor}'"
+        )
 
 
 def remove_extra(kwargs, flavor="lattice"):
-    if flavor == "lattice":
-        for key in kwargs.keys():
-            if key in stream_kwargs:
-                kwargs.pop(key)
-    else:
-        for key in kwargs.keys():
-            if key in lattice_kwargs:
-                kwargs.pop(key)
+    parser_kwargs = flavor_to_kwargs[flavor]
+    # Avoid "dictionary changed size during iteration"
+    kwargs_keys = list(kwargs.keys())
+    for key in kwargs_keys:
+        if key not in parser_kwargs:
+            kwargs.pop(key)
     return kwargs
 
 
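The table-driven rewrite makes the two functions symmetric: validate_input rejects any kwarg outside the flavor's allow-list, while remove_extra silently drops it. A toy run, using the functions as defined above (values made up):

kwargs = {"row_tol": 5, "line_scale": 40}

try:
    validate_input(kwargs, flavor="lattice")
except ValueError as err:
    print(err)  # "row_tol cannot be used with flavor='lattice'"

print(remove_extra(dict(kwargs), flavor="lattice"))  # {'line_scale': 40}
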
 # https://stackoverflow.com/a/22726782
-class TemporaryDirectory(object):
+# and https://stackoverflow.com/questions/10965479
+class TemporaryDirectory():
+    def __init__(self):
+        self.dir_path = None
+
     def __enter__(self):
-        self.name = tempfile.mkdtemp()
-        return self.name
+        self.dir_path = tempfile.mkdtemp()
+        # Only delete the temporary directory upon
+        # program exit.
+        atexit.register(shutil.rmtree, self.dir_path)
+        return self.dir_path
 
     def __exit__(self, exc_type, exc_value, traceback):
-        shutil.rmtree(self.name)
+        pass
+
+
+def build_file_path_in_temp_dir(filename, extension=None):
+    """Generates a new path within a temporary directory
+
+    Parameters
+    ----------
+    filename : str
+    extension : str
+
+    Returns
+    -------
+    file_path_in_temporary_dir : str
+
+    """
+    with TemporaryDirectory() as temp_dir:
+        if extension:
+            filename = filename + extension
+        path = os.path.join(
+            temp_dir,
+            filename
+        )
+        return path
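Because the rewritten TemporaryDirectory defers deletion to interpreter exit via atexit, a path handed out by build_file_path_in_temp_dir stays usable after the with-block closes. A sketch (the printed path is illustrative):

path = build_file_path_in_temp_dir("page-1", extension=".png")
print(path)  # e.g. /tmp/tmpq1w2e3/page-1.png - still writable here, removed at exit
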
 
 
 def translate(x1, x2):
@@ -247,8 +295,9 @@ def scale_image(tables, v_segments, h_segments, factors):
         j_x, j_y = zip(*tables[k])
         j_x = [scale(j, scaling_factor_x) for j in j_x]
         j_y = [scale(abs(translate(-img_y, j)), scaling_factor_y) for j in j_y]
-        joints = zip(j_x, j_y)
-        tables_new[(x1, y1, x2, y2)] = joints
+        tables_new[(x1, y1, x2, y2)] = {
+            "joints": list(zip(j_x, j_y))
+        }
 
     v_segments_new = []
     for v in v_segments:
@@ -296,9 +345,10 @@ def get_rotation(chars, horizontal_text, vertical_text):
     hlen = len([t for t in horizontal_text if t.get_text().strip()])
     vlen = len([t for t in vertical_text if t.get_text().strip()])
     if hlen < vlen:
-        clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in chars)
-        anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in chars)
-        rotation = "anticlockwise" if clockwise < anticlockwise else "clockwise"
+        clockwise = sum(t.matrix[1] < 0 < t.matrix[2] for t in chars)
+        anticlockwise = sum(t.matrix[1] > 0 > t.matrix[2] for t in chars)
+        rotation = "anticlockwise" if clockwise < anticlockwise \
+            else "clockwise"
     return rotation
 
@@ -329,18 +379,98 @@ def segments_in_bbox(bbox, v_segments, h_segments):
     v_s = [
         v
         for v in v_segments
-        if v[1] > lb[1] - 2 and v[3] < rt[1] + 2 and lb[0] - 2 <= v[0] <= rt[0] + 2
+        if v[1] > lb[1] - 2 and
+        v[3] < rt[1] + 2 and
+        lb[0] - 2 <= v[0] <= rt[0] + 2
     ]
     h_s = [
         h
         for h in h_segments
-        if h[0] > lb[0] - 2 and h[2] < rt[0] + 2 and lb[1] - 2 <= h[1] <= rt[1] + 2
+        if h[0] > lb[0] - 2 and
+        h[2] < rt[0] + 2 and
+        lb[1] - 2 <= h[1] <= rt[1] + 2
     ]
     return v_s, h_s
+def get_textline_coords(textline):
+    """Calculate the coordinates of each alignment for a given textline."""
+    return {
+        "left": textline.x0,
+        "right": textline.x1,
+        "middle": (textline.x0 + textline.x1) / 2.0,
+        "bottom": textline.y0,
+        "top": textline.y1,
+        "center": (textline.y0 + textline.y1) / 2.0,
+    }
+
+
+def bbox_from_str(bbox_str):
+    """Deserialize bbox from string ("x1,y1,x2,y2") to tuple (x1, y1, x2, y2).
+
+    Parameters
+    ----------
+    bbox_str : str
+        Serialized bbox with comma separated coordinates, "x1,y1,x2,y2".
+
+    Returns
+    -------
+    bbox : tuple
+        Tuple (x1, y1, x2, y2).
+
+    """
+    x1, y1, x2, y2 = bbox_str.split(",")
+    x1 = float(x1)
+    y1 = float(y1)
+    x2 = float(x2)
+    y2 = float(y2)
+    return (
+        min(x1, x2),
+        min(y1, y2),
+        max(x1, x2),
+        max(y1, y2)
+    )
+
+
+def bboxes_overlap(bbox1, bbox2):
+    (left1, bottom1, right1, top1) = bbox1
+    (left2, bottom2, right2, top2) = bbox2
+    return (
+        (left1 < left2 < right1) or (left1 < right2 < right1)
+    ) and (
+        (bottom1 < bottom2 < top1) or (bottom1 < top2 < top1)
+    )
+
+
+def textlines_overlapping_bbox(bbox, textlines):
+    """Returns all text objects which overlap or are within a bounding box.
+
+    Parameters
+    ----------
+    bbox : tuple
+        Tuple (x1, y1, x2, y2) representing a bounding box where
+        (x1, y1) -> lb and (x2, y2) -> rt in the PDF coordinate
+        space.
+    textlines : List of PDFMiner text objects.
+
+    Returns
+    -------
+    t_bbox : list
+        List of PDFMiner text objects.
+
+    """
+    t_bbox = [
+        t
+        for t in textlines
+        if bboxes_overlap(bbox, (t.x0, t.y0, t.x1, t.y1))
+    ]
+    return t_bbox
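Note that `bbox_from_str` normalizes the coordinate order via min/max, so a string whose corners arrive swapped still yields a well-formed lb/rt box, and `bboxes_overlap` uses strict inequalities, so boxes that merely touch do not count as overlapping. A quick illustration:

bbox_from_str("10,500,300,400")
# -> (10.0, 400.0, 300.0, 500.0): lb first, rt second, whatever the input order

bboxes_overlap((0, 0, 10, 10), (10, 0, 20, 10))   # False: edges only touch
bboxes_overlap((0, 0, 10, 10), (5, 5, 20, 20))    # True: corners interleave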
 def text_in_bbox(bbox, text):
-    """Returns all text objects present inside a bounding box.
+    """Returns all text objects which lie at least 50% inside a bounding box
+    across both dimensions.

     Parameters
     ----------

@@ -367,6 +497,214 @@ def text_in_bbox(bbox, text):
     return t_bbox


+def text_in_bbox_per_axis(bbox, horizontal_text, vertical_text):
+    """Returns all text objects present inside a bounding box, split between
+    horizontal and vertical text.
+
+    Parameters
+    ----------
+    bbox : tuple
+        Tuple (x1, y1, x2, y2) representing a bounding box where
+        (x1, y1) -> lb and (x2, y2) -> rt in the PDF coordinate
+        space.
+    horizontal_text : List of PDFMiner text objects.
+    vertical_text : List of PDFMiner text objects.
+
+    Returns
+    -------
+    t_bbox : dict
+        Dict of lists of PDFMiner text objects that lie inside table, with one
+        key each for "horizontal" and "vertical"
+
+    """
+    t_bbox = {}
+    t_bbox["horizontal"] = text_in_bbox(bbox, horizontal_text)
+    t_bbox["vertical"] = text_in_bbox(bbox, vertical_text)
+
+    t_bbox["horizontal"].sort(key=lambda x: (-x.y0, x.x0))
+    t_bbox["vertical"].sort(key=lambda x: (x.x0, -x.y0))
+    return t_bbox
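The sort keys put each orientation into natural reading order: horizontal lines top-to-bottom then left-to-right (PDF y grows upward, hence the negated `y0`), vertical lines left-to-right then top-to-bottom. A small self-contained check of the horizontal key, with a namedtuple standing in for a PDFMiner textline:

from collections import namedtuple

TL = namedtuple("TL", ["x0", "y0"])  # stand-in for a PDFMiner textline
lines = [TL(200, 700), TL(10, 650), TL(10, 700)]
lines.sort(key=lambda x: (-x.y0, x.x0))
# -> [TL(10, 700), TL(200, 700), TL(10, 650)]: top row first, left to right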
+def expand_bbox_with_textline(bbox, textline):
+    """Expand (if needed) a bbox so that it fits the parameter textline."""
+    return (
+        min(bbox[0], textline.x0),
+        min(bbox[1], textline.y0),
+        max(bbox[2], textline.x1),
+        max(bbox[3], textline.y1)
+    )
+
+
+def bbox_from_textlines(textlines):
+    """Returns the smallest bbox containing all the text objects passed as
+    a parameter.
+
+    Parameters
+    ----------
+    textlines : List of PDFMiner text objects.
+
+    Returns
+    -------
+    bbox : tuple
+        Tuple (x1, y1, x2, y2) representing a bounding box where
+        (x1, y1) -> lb and (x2, y2) -> rt in the PDF coordinate
+        space.
+
+    """
+    if len(textlines) == 0:
+        return None
+    bbox = (
+        textlines[0].x0,
+        textlines[0].y0,
+        textlines[0].x1,
+        textlines[0].y1
+    )
+
+    for tl in textlines[1:]:
+        bbox = expand_bbox_with_textline(bbox, tl)
+    return bbox
+
+
+def find_columns_boundaries(tls, min_gap=1.0):
+    """Make a list of disjunct cols boundaries for a list of text objects
+
+    Parameters
+    ----------
+    tls : list of PDFMiner text objects.
+    min_gap : minimum distance between columns. Any elements closer than
+        this threshold are merged together. This is to prevent spaces
+        between words from being misinterpreted as column boundaries.
+
+    Returns
+    -------
+    boundaries : list
+        List of x-coordinates for cols.
+        [(1st col left, 1st col right), (2nd col left, 2nd col right), ...]
+
+    """
+    cols_bounds = []
+    tls.sort(key=lambda tl: tl.x0)
+    for tl in tls:
+        if (not cols_bounds) or cols_bounds[-1][1] + min_gap < tl.x0:
+            cols_bounds.append([tl.x0, tl.x1])
+        else:
+            cols_bounds[-1][1] = max(cols_bounds[-1][1], tl.x1)
+    return cols_bounds
+
+
+def find_rows_boundaries(tls, min_gap=1.0):
+    """Make a list of disjunct rows boundaries for a list of text objects
+
+    Parameters
+    ----------
+    tls : list of PDFMiner text objects.
+    min_gap : minimum distance between rows. Any elements closer than
+        this threshold are merged together.
+
+    Returns
+    -------
+    boundaries : list
+        List of y-coordinates for rows.
+        [(1st row bottom, 1st row top), (2nd row bottom, 2nd row top), ...]
+
+    """
+    rows_bounds = []
+    tls.sort(key=lambda tl: tl.y0)
+    for tl in tls:
+        if (not rows_bounds) or rows_bounds[-1][1] + min_gap < tl.y0:
+            rows_bounds.append([tl.y0, tl.y1])
+        else:
+            rows_bounds[-1][1] = max(rows_bounds[-1][1], tl.y1)
+    return rows_bounds
+
+
+def boundaries_to_split_lines(boundaries):
+    """Find split lines given a list of boundaries between rows or cols.
+
+    Boundaries: [ a ]  [b]  [  c  ]  [d]
+    Splits:     |    |    |        |    |
+
+    Parameters
+    ----------
+    boundaries : list
+        List of tuples of x- (for columns) or y- (for rows) coord boundaries.
+        These are the (left, right most) or (bottom, top most) coordinates.
+
+    Returns
+    -------
+    anchors : list
+        List of coordinates representing the split points, each half way
+        between boundaries
+
+    """
+    # From the row boundaries, identify splits by getting the mid points
+    # between the boundaries.
+    anchors = list(map(
+        lambda idx: (boundaries[idx-1][1] + boundaries[idx][0]) / 2.0,
+        range(1, len(boundaries))
+    ))
+    anchors.insert(0, boundaries[0][0])
+    anchors.append(boundaries[-1][1])
+    return anchors
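A worked example of the mid-point computation, using the boundary notation from the docstring above:

boundaries = [(10, 40), (50, 90), (120, 200)]
# interior splits: (40 + 50) / 2 = 45.0 and (90 + 120) / 2 = 105.0
boundaries_to_split_lines(boundaries)
# -> [10, 45.0, 105.0, 200]: the outer edges plus one split per gap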
+def get_index_closest_point(point, sorted_list, fn=lambda x: x):
+    """Return the index of the closest point in the sorted list.
+
+    Parameters
+    ----------
+    point : the reference sortable element to search.
+    sorted_list : list
+    fn : optional accessor function
+
+    Returns
+    -------
+    index : int
+
+    """
+    n = len(sorted_list)
+    if n == 0:
+        return None
+    if n == 1:
+        return 0
+
+    left = 0
+    right = n - 1
+    mid = 0
+
+    if point >= fn(sorted_list[n - 1]):
+        return n - 1
+    if point <= fn(sorted_list[0]):
+        return 0
+
+    while left < right:
+        mid = (left + right) // 2  # find the mid
+        mid_val = fn(sorted_list[mid])
+        if point < mid_val:
+            right = mid
+        elif point > mid_val:
+            left = mid + 1
+        else:
+            return mid
+
+    if mid_val > point:
+        if mid > 0 and (
+                point - fn(sorted_list[mid-1]) <
+                mid_val - point):
+            return mid-1
+    elif mid_val < point:
+        if mid < n - 1 and (
+                fn(sorted_list[mid+1]) - point <
+                point - mid_val):
+            return mid+1
+    return mid
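Usage is a plain binary search over an ascending list, with the accessor allowing structured elements; for example, picking the nearest plain value or the row tuple whose first coordinate is nearest:

get_index_closest_point(42, [10, 20, 40, 80])
# -> 2: 40 is nearer to 42 than 80 is

rows = [(600, 620), (640, 660), (680, 700)]
get_index_closest_point(655, rows, fn=lambda row: row[0])
# -> 1: compares 655 against each row's first coordinate (600, 640, 680)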
 def merge_close_lines(ar, line_tol=2):
     """Merges lines which are within a tolerance by calculating a
     moving mean, based on their x or y axis projections.

@@ -452,10 +790,10 @@ def flag_font_size(textline, direction, strip_text=""):
         for t in textline
         if not isinstance(t, LTAnno)
     ]
-    l = [np.round(size, decimals=6) for text, size in d]
-    if len(set(l)) > 1:
+    text_sizes = [np.round(size, decimals=6) for text, size in d]
+    if len(set(text_sizes)) > 1:
         flist = []
-        min_size = min(l)
+        min_size = min(text_sizes)
         for key, chars in groupby(d, itemgetter(1)):
             if key == min_size:
                 fchars = [t[0] for t in chars]

@@ -469,12 +807,12 @@ def flag_font_size(textline, direction, strip_text=""):
             flist.append("".join(fchars))
         fstring = "".join(flist)
     else:
-        fstring = "".join([t.get_text() for t in textline])
+        fstring = "".join(t.get_text() for t in textline)
     return text_strip(fstring, strip_text)


 def split_textline(table, textline, direction, flag_size=False, strip_text=""):
-    """Splits PDFMiner LTTextLine into substrings if it spans across
+    """Split PDFMiner LTTextLine into substrings if it spans across
     multiple rows/columns.

     Parameters
@@ -499,7 +837,6 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""):
         of row/column and text is the an lttextline substring.

     """
-    idx = 0
     cut_text = []
     bbox = textline.bbox
     try:

@@ -516,7 +853,9 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""):
             ]
             r = r_idx[0]
             x_cuts = [
-                (c, table.cells[r][c].x2) for c in x_overlap if table.cells[r][c].right
+                (c, table.cells[r][c].x2)
+                for c in x_overlap
+                if table.cells[r][c].right
             ]
             if not x_cuts:
                 x_cuts = [(x_overlap[0], table.cells[r][-1].x2)]

@@ -530,10 +869,9 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""):
                     ):
                         cut_text.append((r, cut[0], obj))
                         break
-                    else:
-                        # TODO: add test
-                        if cut == x_cuts[-1]:
-                            cut_text.append((r, cut[0] + 1, obj))
+                    # TODO: add test
+                    if cut == x_cuts[-1]:
+                        cut_text.append((r, cut[0] + 1, obj))
                 elif isinstance(obj, LTAnno):
                     cut_text.append((r, cut[0], obj))
         elif direction == "vertical" and not textline.is_empty():

@@ -549,7 +887,9 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""):
             ]
             c = c_idx[0]
             y_cuts = [
-                (r, table.cells[r][c].y1) for r in y_overlap if table.cells[r][c].bottom
+                (r, table.cells[r][c].y1)
+                for r in y_overlap
+                if table.cells[r][c].bottom
             ]
             if not y_cuts:
                 y_cuts = [(y_overlap[0], table.cells[-1][c].y1)]

@@ -557,16 +897,13 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""):
             col = table.cols[c]
             for cut in y_cuts:
                 if isinstance(obj, LTChar):
-                    if (
-                        col[0] <= (obj.x0 + obj.x1) / 2 <= col[1]
-                        and (obj.y0 + obj.y1) / 2 >= cut[1]
-                    ):
+                    if col[0] <= (obj.x0 + obj.x1) / 2 <= col[1] \
+                            and (obj.y0 + obj.y1) / 2 >= cut[1]:
                         cut_text.append((cut[0], c, obj))
                         break
-                    else:
-                        # TODO: add test
-                        if cut == y_cuts[-1]:
-                            cut_text.append((cut[0] - 1, c, obj))
+                    # TODO: add test
+                    if cut == y_cuts[-1]:
+                        cut_text.append((cut[0] - 1, c, obj))
                 elif isinstance(obj, LTAnno):
                     cut_text.append((cut[0], c, obj))
     except IndexError:
@@ -632,9 +969,8 @@ def get_table_index(
     """
     r_idx, c_idx = [-1] * 2
     for r in range(len(table.rows)):
-        if (t.y0 + t.y1) / 2.0 < table.rows[r][0] and (t.y0 + t.y1) / 2.0 > table.rows[
-            r
-        ][1]:
+        if (t.y0 + t.y1) / 2.0 < table.rows[r][0] and \
+                (t.y0 + t.y1) / 2.0 > table.rows[r][1]:
             lt_col_overlap = []
             for c in table.cols:
                 if c[0] <= t.x1 and c[1] >= t.x0:

@@ -648,7 +984,8 @@ def get_table_index(
                 text_range = (t.x0, t.x1)
                 col_range = (table.cols[0][0], table.cols[-1][1])
                 warnings.warn(
-                    f"{text} {text_range} does not lie in column range {col_range}"
+                    f"{text} {text_range} does not lie in column range "
+                    f"{col_range}"
                 )
             r_idx = r
             c_idx = lt_col_overlap.index(max(lt_col_overlap))

@@ -667,7 +1004,9 @@ def get_table_index(
     X = 1.0 if abs(t.x0 - t.x1) == 0.0 else abs(t.x0 - t.x1)
     Y = 1.0 if abs(t.y0 - t.y1) == 0.0 else abs(t.y0 - t.y1)
     charea = X * Y
-    error = ((X * (y0_offset + y1_offset)) + (Y * (x0_offset + x1_offset))) / charea
+    error = (
+        (X * (y0_offset + y1_offset)) + (Y * (x0_offset + x1_offset))
+    ) / charea

     if split_text:
         return (

@@ -676,20 +1015,21 @@ def get_table_index(
             ),
             error,
         )
-    else:
-        if flag_size:
-            return (
-                [
-                    (
-                        r_idx,
-                        c_idx,
-                        flag_font_size(t._objs, direction, strip_text=strip_text),
-                    )
-                ],
-                error,
-            )
-        else:
-            return [(r_idx, c_idx, text_strip(t.get_text(), strip_text))], error
+    if flag_size:
+        return (
+            [
+                (
+                    r_idx,
+                    c_idx,
+                    flag_font_size(t._objs,
+                                   direction,
+                                   strip_text=strip_text),
+                )
+            ],
+            error,
+        )
+    return [(r_idx, c_idx, text_strip(t.get_text(), strip_text))], \
+        error


 def compute_accuracy(error_weights):
@@ -711,7 +1051,7 @@ def compute_accuracy(error_weights):
     SCORE_VAL = 100
     try:
         score = 0
-        if sum([ew[0] for ew in error_weights]) != SCORE_VAL:
+        if sum(ew[0] for ew in error_weights) != SCORE_VAL:
             raise ValueError("Sum of weights should be equal to 100.")
         for ew in error_weights:
             weight = ew[0] / len(ew[1])

@@ -737,7 +1077,6 @@ def compute_whitespace(d):
     """
     whitespace = 0
-    r_nempty_cells, c_nempty_cells = [], []
     for i in d:
         for j in i:
             if j.strip() == "":
@@ -747,13 +1086,12 @@
 def get_page_layout(
     filename,
     char_margin=1.0,
     line_margin=0.5,
     word_margin=0.1,
     detect_vertical=True,
-    all_texts=True,
-):
+    all_texts=True):
     """Returns a PDFMiner LTPage object and page dimension of a single
     page pdf. See https://euske.github.io/pdfminer/ to get definitions
     of kwargs.

@@ -797,6 +1135,7 @@ def get_page_layout(
         width = layout.bbox[2]
         height = layout.bbox[3]
         dim = (width, height)
+        break  # we assume a single page pdf
     return layout, dim


@@ -838,3 +1177,117 @@ def get_text_objects(layout, ltype="char", t=None):
     except AttributeError:
         pass
     return t
+def export_pdf_as_png(pdf_path, destination_path, resolution=300):
+    """Generate an image from a pdf.
+
+    Parameters
+    ----------
+    pdf_path : str
+    destination_path : str
+    resolution : int
+    """
+    gs_call = "-q -sDEVICE=png16m -o " \
+              "{destination_path} -r{resolution} {pdf_path}" \
+        .format(
+            destination_path=destination_path,
+            resolution=resolution,
+            pdf_path=pdf_path
+        )
+    gs_call = gs_call.encode().split()
+    null = open(os.devnull, "wb")
+    Ghostscript(*gs_call, stdout=null)
+    null.close()
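A minimal usage sketch; the comment shows the Ghostscript argument list the helper assembles (the file names are placeholders):

export_pdf_as_png("page.pdf", "page.png")
# equivalent to running: gs -q -sDEVICE=png16m -o page.png -r300 page.pdf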
+def compare_tables(left, right):
+    """Compare two tables and display differences in a human readable form.
+
+    Parameters
+    ----------
+    left : data frame
+    right : data frame
+    """
+    diff_cols = right.shape[1]-left.shape[1]
+    diff_rows = right.shape[0]-left.shape[0]
+    differences = []
+    if diff_rows:
+        differences.append(
+            "{diff_rows} {more_fewer} rows".format(
+                diff_rows=abs(diff_rows),
+                more_fewer='more' if diff_rows > 0 else 'fewer'
+            )
+        )
+    if diff_cols:
+        differences.append(
+            "{diff_cols} {more_fewer} columns".format(
+                diff_cols=abs(diff_cols),
+                more_fewer='more' if diff_cols > 0 else 'fewer'
+            )
+        )
+    if differences:
+        differences_str = " and ".join(differences)
+        print(
+            "Right has {differences_str} than left "
+            "{shape_left} vs {shape_right}".format(
+                differences_str=differences_str,
+                shape_left=[left.shape[0], left.shape[1]],
+                shape_right=[right.shape[0], right.shape[1]],
+            )
+        )
+
+    table1, table2 = [left, right]
+    name_table1, name_table2 = ["left", "right"]
+    if not diff_cols:
+        # Same number of cols: compare rows since they're of the same length
+        if diff_rows > 0:
+            # Use the longest table as a reference
+            table1, table2 = table2, table1
+            name_table1, name_table2 = name_table2, name_table1
+        for index, lrow in table1.iterrows():
+            if index < table2.shape[0]:
+                srow = table2.loc[index, :]
+                if not lrow.equals(srow):
+                    diff_df = pd.DataFrame()
+                    diff_df = diff_df.append(lrow, ignore_index=True)
+                    diff_df = diff_df.append(srow, ignore_index=True)
+                    diff_df.insert(0, 'Table', [name_table1, name_table2])
+                    print("Row {index} differs:".format(index=index))
+                    print(diff_df.values)
+                    break
+            else:
+                print("Row {index} unique to {name_table1}: {lrow}".format(
+                    index=index,
+                    name_table1=name_table1,
+                    lrow=lrow
+                ))
+                break
+    elif not diff_rows:
+        # Same number of rows: compare columns since they're the same length
+        if diff_cols > 0:
+            # Use the longest table as a reference
+            table1, table2 = table2, table1
+            name_table1, name_table2 = name_table2, name_table1
+        for i, col in enumerate(table1.columns):
+            lcol = table1.iloc[:, i]
+            if col in table2:
+                scol = table2.iloc[:, i]
+                if not lcol.equals(scol):
+                    diff_df = pd.DataFrame()
+                    diff_df[name_table1] = scol
+                    diff_df[name_table2] = lcol
+                    diff_df["Match"] = lcol == scol
+                    print(
+                        "Column {i} different:\n"
+                        "{diff_df}".format(
+                            i=i,
+                            diff_df=diff_df
+                        )
+                    )
+                    break
+            else:
+                print("Column {i} unique to {name_table1}: {lcol}".format(
+                    i=i,
+                    name_table1=name_table1,
+                    lcol=lcol
+                ))
+                break
+    else:
+        print("Tables have different shapes")
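A usage sketch for a debugging session (the frames here are placeholders; `compare_tables` is importable from `camelot.utils` after this change, as the test module below does):

import pandas as pd
from camelot.utils import compare_tables

left = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
right = pd.DataFrame({"a": [1, 2], "b": [3, 5]})
compare_tables(left, right)
# prints the first differing row, with a 'Table' column naming the source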
@@ -13,7 +13,7 @@

     $ conda install -c conda-forge camelot-py

-.. note:: Camelot is available for Python 2.7, 3.5, 3.6 and 3.7 on Linux, macOS and Windows. For Windows, you will need to install ghostscript which you can get from their `downloads page`_.
+.. note:: Camelot is available for Python 3.5, 3.6 and 3.7 on Linux, macOS and Windows. For Windows, you will need to install ghostscript, which you can get from their `downloads page`_.

 .. _conda: https://conda.io/docs/
 .. _Anaconda: http://docs.continuum.io/anaconda/
@@ -0,0 +1,351 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Hybrid Parser step-by-step\n",
+    "\n",
+    "This notebook describes the algorithms behind the hybrid parser, which blends the results of the network parser (text based) and the lattice parser (image based).\n",
+    "\n",
+    "You can modify the section below to point to a pdf of your choice to visualize how the algorithm analyzes it. By default, it points to one of the test .pdfs included with camelot.\n",
+    "\n",
+    "You can also use the `parser-comparison-notebook` notebook to compare the parsers' results side-by-side."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Bootstrap and common imports\n",
+    "import os, sys, time\n",
+    "sys.path.insert(0, os.path.abspath(''))  # Prefer the local version of camelot if available\n",
+    "import camelot\n",
+    "\n",
+    "print(f\"Using Camelot v{camelot.__version__} from file {camelot.__file__}.\")\n",
+    "\n",
+    "# Select a pdf to analyze.\n",
+    "kwargs = {}\n",
+    "data = None\n",
+    "# pdf_file = \"vertical_header.pdf\"  # test_network_vertical_header\n",
+    "# pdf_file, kwargs = \"background_lines_1.pdf\", {}  # {\"process_background\": True}  # test_lattice_process_background\n",
+    "\n",
+    "# pdf_file, kwargs, data = \"superscript.pdf\", {\"flag_size\": True}, data_stream_flag_size  # test_network_flag_size\n",
+    "# pdf_file = \"health.pdf\"  # test_network\n",
+    "# pdf_file = \"clockwise_table_2.pdf\"\n",
+    "# pdf_file = \"tabula/12s0324.pdf\"  # interesting because it contains two separate tables\n",
+    "# pdf_file, kwargs = \"tabula/us-007.pdf\", {\"table_regions\": [\"320,335,573,505\"]}  # test_network_table_regions\n",
+    "# pdf_file, kwargs = \"tabula/us-007.pdf\", {\"table_areas\": [\"320,500,573,335\"]}  # test_network_table_areas\n",
+    "# pdf_file, kwargs = \"detect_vertical_false.pdf\", {\"strip_text\": \" ,\\n\"}  # data_stream_strip_text\n",
+    "# pdf_file, kwargs, data = \"tabula/m27.pdf\", {\"columns\": [\"72,95,209,327,442,529,566,606,683\"], \"split_text\": True, }, data_stream_split_text  # data_stream_split_text\n",
+    "# pdf_file = \"clockwise_table_2.pdf\"  # test_network_table_rotated / test_stream_table_rotated\n",
+    "pdf_file = \"vertical_header.pdf\"\n",
+    "\n",
+    "# pdf_file = \"twotables_2.pdf\"\n",
+    "# pdf_file = \"camelot-issue-132-multiple-tables.pdf\"\n",
+    "# pdf_file, kwargs, data = \"edge_tol.pdf\", {\"edge_tol\": 500}, data_stream_edge_tol\n",
+    "# pdf_file, kwargs, data = \"edge_tol.pdf\", {}, data_stream_edge_tol\n",
+    "\n",
+    "filename = os.path.join(\n",
+    "    os.path.dirname(os.path.abspath('.')),\n",
+    "    \"camelot/tests/files\",\n",
+    "    pdf_file\n",
+    ")\n",
+    "\n",
+    "# Set up plotting options\n",
+    "import matplotlib.pyplot as plt\n",
+    "%matplotlib inline\n",
+    "PLOT_HEIGHT = 12\n",
+    "def init_figure_and_axis(title):\n",
+    "    fig = plt.figure(figsize=(PLOT_HEIGHT * 2.5, PLOT_HEIGHT))\n",
+    "    ax = fig.add_subplot(111)\n",
+    "    ax.set_title(title)\n",
+    "    return fig, ax\n",
+    "\n",
+    "# Utility function to display tables\n",
+    "def display_parse_results(tables, parse_time, flavor):\n",
+    "    if not tables:\n",
+    "        return\n",
+    "    tables_dims = \", \".join(\n",
+    "        map(\n",
+    "            lambda table: \"{rows}x{cols}\".format(\n",
+    "                rows=table.shape[0],\n",
+    "                cols=table.shape[1],\n",
+    "            ), tables\n",
+    "        )\n",
+    "    )\n",
+    "    print(f\"The {flavor} parser found {len(tables)} table(s) ({tables_dims}) in {parse_time:.2f}s\")\n",
+    "    for table in tables:\n",
+    "        display(table.df)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Overall Algorithm\n",
+    "\n",
+    "The hybrid parser combines results from the network parser and the lattice parser to get the \"best of both worlds.\" Before we look at the combination itself, let's see how each of the two parsers works.\n",
+    "\n",
+    "### Network parser\n",
+    "\n",
+    "The network parser is text-based: it relies on the bounding boxes of the text elements encoded in the .pdf document to identify patterns indicative of a table.\n",
+    "\n",
+    "The plot below shows the bounding boxes of all the text elements on the parsed document, in light blue for horizontal elements and light red for vertical elements (rare in most documents)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Parse file\n",
+    "flavor = \"network\"\n",
+    "timer_before_parse = time.perf_counter()\n",
+    "tables = camelot.read_pdf(filename, flavor=flavor, debug=True, **kwargs)\n",
+    "timer_after_parse = time.perf_counter()\n",
+    "\n",
+    "if tables:\n",
+    "    fig, ax = init_figure_and_axis(f\"Text elements in PDF\\n{pdf_file}\")\n",
+    "    camelot.plot(tables[0], kind=\"text\", ax=ax)\n",
+    "else:\n",
+    "    print(\"No table found for this document.\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Network parser - step 1: Identify a network of connected alignments\n",
+    "\n",
+    "The network parser starts by identifying common horizontal (shown in green on the plot below) or vertical (in blue) coordinate alignments across these text elements. In other words, it looks for bounding box rectangles which either share the same top, center, or bottom coordinates (horizontal axis), or the same left, right, or middle coordinates (vertical axis). See the `generate` method.\n",
+    "\n",
+    "Once the parser has found these alignments, it performs some pruning to keep only text elements that are part of a network, with connections along both axes. The idea is that it's not enough for two elements to be aligned to belong to a table; for instance, the lines of text in this paragraph are all left-aligned, but they do not form a network. The pruning is done iteratively, see the `remove_unconnected_edges` method.\n",
+    "\n",
+    "Once the network is pruned, the parser keeps track of how many alignments each text element belongs to: that's the number on top (vertical alignments) or to the left of each alignment in the plot below. The text element with the most connections (in red on the plot) is the starting point, the *seed*, of the next step. Finally, the parser measures how far the alignments are from one another, to determine a plausible search zone around each cell for the next stage of growing the table. See the `compute_plausible_gaps` method."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if tables:\n",
+    "    fig, ax = init_figure_and_axis(f\"Text edges in PDF\\n{pdf_file}\")\n",
+    "    camelot.plot(tables[0], kind=\"textedge\", ax=ax)\n",
+    "else:\n",
+    "    print(f\"No table found for document {pdf_file}.\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Network parser - step 2: Detect table body iteratively from seed\n",
+    "\n",
+    "In the next step, the parser iteratively \"grows\" a table, starting from the seed identified in the previous step. The bounding box is initialized with the bounding box of the seed, then the parser iteratively searches for text elements that are close to the bounding box and grows the table to ingest them, until there are no more text elements to ingest. The two steps are:\n",
+    "* Search: create a search bounding box by expanding the current table bounding box in all directions, based on the plausible gap numbers determined above. Search bounding boxes are shown in orange on the graph below.\n",
+    "* Grow: if a networked text element is found in this search area, expand the table bounding box so that it includes this new element. Each successive table bounding box is shown in red in the plot below.\n",
+    "\n",
+    "Notice in the plot below how the search area and the table bounding box grow starting from the seed. See the method `search_table_body`.\n",
+    "\n",
+    "#### Network parser - step 3: Search for a header section\n",
+    "\n",
+    "Headers are often aligned differently from the rest of the table. To account for this, the network parser searches for text elements that are good candidates for a header section: these text elements are just above the bounding box of the body of the table, and they fit within the rows identified in the table body. See the method `search_header_from_body_bbox`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if tables:\n",
+    "    fig, ax = init_figure_and_axis(f\"Growth steps for table in PDF\\n{pdf_file}\")\n",
+    "    camelot.plot(tables[0], kind=\"network_table_search\", ax=ax)\n",
+    "else:\n",
+    "    print(\"No table found for this document.\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Network parser - step 4: Repeat\n",
+    "\n",
+    "There are sometimes multiple tables on one page. So once a first table is identified, all the text edges it contains are removed, and the algorithm is repeated until no new network is identified.\n",
+    "\n",
+    "The final parse for this .pdf is as follows:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "display_parse_results(tables, timer_after_parse - timer_before_parse, flavor)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Lattice parser\n",
+    "\n",
+    "The lattice parser is based on an analysis of the image rendered from the .pdf, rather than its text content. It relies on the tables' borders being drawn as solid lines.\n",
+    "\n",
+    "#### Lattice parser - step 1: Identify solid lines within the document.\n",
+    "\n",
+    "The lattice parser relies on the OpenCV library (`getStructuringElement` function) to detect all solid vertical and horizontal lines within the document."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Parse file\n",
+    "flavor = \"lattice\"\n",
+    "timer_before_parse = time.perf_counter()\n",
+    "tables = camelot.read_pdf(filename, flavor=flavor, debug=True, **kwargs)\n",
+    "timer_after_parse = time.perf_counter()\n",
+    "\n",
+    "if tables:\n",
+    "    fig, ax = init_figure_and_axis(f\"Line structure in PDF\\n{pdf_file}\")\n",
+    "    camelot.plot(tables[0], kind=\"line\", ax=ax)\n",
+    "else:\n",
+    "    print(\"No table found for this document.\")\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Lattice parser - step 2: Find the contours of the table(s) based on the solid lines.\n",
+    "\n",
+    "The lattice parser then uses OpenCV's `findContours` function to detect the overall bounding box of the table(s), since the solid lines might draw more than one table."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for table in tables:\n",
+    "    fig, ax = init_figure_and_axis(f\"Contour structure in PDF\\n{pdf_file}\")\n",
+    "    camelot.plot(table, kind=\"contour\", ax=ax)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Lattice parser - step 3: Identify joints\n",
+    "\n",
+    "For each table bounding box (contour), the lattice parser then makes a list of all the intersections between vertical and horizontal lines: the joints."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for table in tables:\n",
+    "    fig, ax = init_figure_and_axis(f\"Joint structure in PDF\\n{pdf_file}\")\n",
+    "    camelot.plot(table, kind=\"joint\", ax=ax)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Lattice parser - step 4: Identify rows and columns\n",
+    "\n",
+    "In the final step, the algorithm sorts all the x coordinates of the joints to identify the position of the table's columns, and the y coordinates for the table's rows. See the method `_generate_columns_and_rows`.\n",
+    "\n",
+    "The resulting lattice parse for the .pdf is as follows."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "display_parse_results(tables, timer_after_parse - timer_before_parse, flavor)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Combining results of Network and Lattice with the Hybrid parser\n",
+    "\n",
+    "The hybrid parser aims to combine the strengths of the Network parser (identifying cells based on text alignments) and of the Lattice parser (relying on solid lines to determine table row and column boundaries).\n",
+    "\n",
+    "#### Hybrid parser - step 1: Apply both parsers' table bounding box detection techniques to the document\n",
+    "\n",
+    "In this step, hybrid calls both parsers, to get a) the standard table parse, b) the coordinates of the row and column boundaries, and c) the table boundaries (or contour).\n",
+    "\n",
+    "#### Hybrid parser - step 2: Merge the results\n",
+    "\n",
+    "If there are areas in the document where both lattice and network found a table, the hybrid parser uses the results from network, but enhances them based on the row/column boundaries identified by lattice in the area. Because lattice uses the solid lines detected on the document, the coordinates for b) and c) detected by lattice are generally more precise. See the `_merge_bbox_analysis` method.\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "flavor = \"hybrid\"\n",
+    "timer_before_parse = time.perf_counter()\n",
+    "tables = camelot.read_pdf(filename, flavor=flavor, debug=True, **kwargs)\n",
+    "timer_after_parse = time.perf_counter()\n",
+    "\n",
+    "display_parse_results(tables, timer_after_parse - timer_before_parse, flavor)"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python",
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "version": "3.7.7-final"
+  },
+  "orig_nbformat": 2,
+  "file_extension": ".py",
+  "mimetype": "text/x-python",
+  "name": "python",
+  "npconvert_exporter": "python",
+  "pygments_lexer": "ipython3",
+  "version": 3,
+  "kernelspec": {
+   "name": "python37764bit8418972e58f441528b05b4b21a1f095d",
+   "display_name": "Python 3.7.7 64-bit"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
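Steps 1-3 of the network parser described above reduce to a fixed-point search/grow loop. A hedged pseudo-sketch of that loop, not camelot's actual API: the `grow_table_from_seed` name and the `plausible_gaps` tuple are illustrative, while the bbox helpers are the ones introduced earlier in this diff.

from collections import namedtuple
from camelot.utils import bboxes_overlap, expand_bbox_with_textline  # helpers added in this diff

TL = namedtuple("TL", ["x0", "y0", "x1", "y1"])  # stand-in for a PDFMiner textline

def grow_table_from_seed(seed_bbox, textlines, plausible_gaps):
    """Pseudo-sketch of the search/grow iteration described in the notebook."""
    bbox = seed_bbox
    remaining = set(textlines)
    while True:
        # Search: pad the current table bbox by the plausible gaps in x and y.
        search = (bbox[0] - plausible_gaps[0], bbox[1] - plausible_gaps[1],
                  bbox[2] + plausible_gaps[0], bbox[3] + plausible_gaps[1])
        found = [tl for tl in remaining
                 if bboxes_overlap(search, (tl.x0, tl.y0, tl.x1, tl.y1))]
        if not found:
            return bbox  # fixed point: nothing left to ingest
        # Grow: expand the table bbox to ingest every element found.
        for tl in found:
            bbox = expand_bbox_with_textline(bbox, tl)
            remaining.discard(tl)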
@@ -0,0 +1,201 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Parser comparison\n",
+    "\n",
+    "This notebook lets you visualize side-by-side how each parser analyzes a document, and compare the resulting tables.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Bootstrap and common imports\n",
+    "import os, sys, time\n",
+    "sys.path.insert(0, os.path.abspath(''))  # Prefer the local version of camelot if available\n",
+    "import camelot\n",
+    "\n",
+    "print(f\"Using Camelot v{camelot.__version__} from file {camelot.__file__}.\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Select a PDF file to review\n",
+    "\n",
+    "This is seeded with the unit test files for convenience."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "kwargs = {}\n",
+    "data = None\n",
+    "# pdf_file, kwargs, data = \"superscript.pdf\", {\"flag_size\": True}, data_stream_flag_size  # test_hybrid_flag_size\n",
+    "# pdf_file = \"health.pdf\"  # test_hybrid\n",
+    "# pdf_file = \"clockwise_table_2.pdf\"\n",
+    "\n",
+    "# pdf_file = \"tabula/12s0324.pdf\"  # interesting because it contains two separate tables\n",
+    "\n",
+    "# pdf_file = \"clockwise_table_2.pdf\"  # test_hybrid_table_rotated / test_stream_table_rotated\n",
+    "# pdf_file, kwargs = \"tabula/us-007.pdf\", {\"table_regions\": [\"320,335,573,505\"]}  # test_hybrid_table_regions\n",
+    "# pdf_file, kwargs = \"detect_vertical_false.pdf\", {\"strip_text\": \" ,\\n\"}  # data_stream_strip_text\n",
+    "# pdf_file, kwargs, data = \"tabula/m27.pdf\", {\"columns\": [\"72,95,209,327,442,529,566,606,683\"], \"split_text\": True, }, data_stream_split_text  # data_stream_split_text\n",
+    "pdf_file = \"vertical_header.pdf\"\n",
+    "\n",
+    "# pdf_file, kwargs = \"vertical_header.pdf\", {\"pages\": \"2\"}\n",
+    "\n",
+    "# pdf_file, kwargs = \"PIR_Prospetto.dOfferta.pdf\", {\"pages\": \"6\"}\n",
+    "# pdf_file = \"twotables_2.pdf\"  # Lattice is better\n",
+    "# pdf_file = \"camelot-issue-132-multiple-tables.pdf\"\n",
+    "# pdf_file, kwargs, data = \"edge_tol.pdf\", {\"edge_tol\": 500}, data_stream_edge_tol\n",
+    "# pdf_file, kwargs, data = \"edge_tol.pdf\", {}, data_stream_edge_tol\n",
+    "# pdf_file, kwargs = \"tabula/icdar2013-dataset/competition-dataset-us/us-030.pdf\", {\"pages\": \"2\"}  # test_lattice\n",
+    "# pdf_file, kwargs = \"background_lines_1.pdf\", {\"process_background\": True}  # test_lattice_process_background\n",
+    "\n",
+    "filename = os.path.join(\n",
+    "    os.path.dirname(os.path.abspath('.')),\n",
+    "    \"camelot/tests/files\",\n",
+    "    pdf_file\n",
+    ")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "FLAVORS = [\"stream\", \"lattice\", \"network\", \"hybrid\"]\n",
+    "tables_parsed = {}\n",
+    "parses = {}\n",
+    "max_tables = 0\n",
+    "for idx, flavor in enumerate(FLAVORS):\n",
+    "    timer_before_parse = time.perf_counter()\n",
+    "    error, tables = None, []\n",
+    "    try:\n",
+    "        tables = camelot.read_pdf(filename, flavor=flavor, debug=True, **kwargs)\n",
+    "    except ValueError as value_error:\n",
+    "        error = f\"Invalid argument for parser {flavor}: {value_error}\"\n",
+    "        print(error)\n",
+    "    timer_after_parse = time.perf_counter()\n",
+    "    max_tables = max(max_tables, len(tables))\n",
+    "\n",
+    "    parses[flavor] = {\n",
+    "        \"tables\": tables,\n",
+    "        \"time\": timer_after_parse - timer_before_parse,\n",
+    "        \"error\": error\n",
+    "    }\n",
+    "\n",
+    "    print(f\"##### {flavor} ####\")\n",
+    "    print(f\"Found {len(tables)} table(s):\")\n",
+    "    for idx, table in enumerate(tables):\n",
+    "        flavors_matching = []\n",
+    "        for previous_flavor, previous_tables in tables_parsed.items():\n",
+    "            for prev_idx, previous_table in enumerate(previous_tables):\n",
+    "                if previous_table.df.equals(table.df):\n",
+    "                    flavors_matching.append(\n",
+    "                        f\"{previous_flavor} table {prev_idx}\")\n",
+    "        print(f\"## Table {idx} ##\")\n",
+    "        if flavors_matching:\n",
+    "            print(f\"Same as {', '.join(flavors_matching)}.\")\n",
+    "        else:\n",
+    "            display(table.df)\n",
+    "        print(\"\")\n",
+    "    tables_parsed[flavor] = tables\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Show tables layout within original document"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "\n",
+    "# Set up plotting options\n",
+    "import matplotlib.pyplot as plt\n",
+    "%matplotlib inline\n",
+    "PLOT_HEIGHT = 12\n",
+    "\n",
+    "row_count = max(max_tables, 1)\n",
+    "plt.rcParams[\"figure.figsize\"] = [PLOT_HEIGHT * len(FLAVORS), PLOT_HEIGHT * row_count]\n",
+    "fig, axes = plt.subplots(row_count, len(FLAVORS))\n",
+    "plt.subplots_adjust(wspace=0, hspace=0)  # Reduce margins to maximize the display zone\n",
+    "\n",
+    "fig.suptitle('Side-by-side flavor comparison', fontsize=24, fontweight='bold')\n",
+    "for idx, flavor in enumerate(FLAVORS):\n",
+    "    parse = parses[flavor]\n",
+    "    tables = parse[\"tables\"]\n",
+    "    top_ax = axes.flat[idx]\n",
+    "    title = f\"{flavor}\\n\" \\\n",
+    "            f\"Detected {len(tables)} table(s) in {parse['time']:.2f}s\"\n",
+    "    if parse['error']:\n",
+    "        title = title + f\"\\nError parsing: {parse['error']}\"\n",
+    "    top_ax.set_title(title, fontsize=12, fontweight='bold')\n",
+    "    for table_idx, table in enumerate(tables):\n",
+    "        if max_tables > 1:\n",
+    "            ax = axes[table_idx][idx]\n",
+    "        else:\n",
+    "            ax = axes[idx]\n",
+    "        fig = camelot.plot(table, kind='grid', ax=ax)\n",
+    "        ax.text(\n",
+    "            0.5, -0.1,\n",
+    "            \"{flavor} table {table_idx} - {rows}x{cols}\".format(\n",
+    "                flavor=flavor,\n",
+    "                table_idx=table_idx,\n",
+    "                rows=table.shape[0],\n",
+    "                cols=table.shape[1],\n",
+    "            ),\n",
+    "            size=14, ha=\"center\",\n",
+    "            transform=ax.transAxes\n",
+    "        )\n",
+    "    timer_after_plot = time.perf_counter()"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python",
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "version": "3.7.7-final"
+  },
+  "orig_nbformat": 2,
+  "file_extension": ".py",
+  "mimetype": "text/x-python",
+  "name": "python",
+  "npconvert_exporter": "python",
+  "pygments_lexer": "ipython3",
+  "version": 3,
+  "kernelspec": {
+   "name": "python37764bit8418972e58f441528b05b4b21a1f095d",
+   "display_name": "Python 3.7.7 64-bit"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
@@ -5,6 +5,6 @@ numpy>=1.13.3
 opencv-python>=3.4.2.17
 openpyxl>=2.5.8
 pandas>=0.23.4
-pdfminer.six>=20170720
+pdfminer.six>=20200402
 PyPDF2>=1.26.0
 Sphinx>=1.7.9
@@ -3,4 +3,6 @@ test=pytest

 [tool:pytest]
 addopts = --verbose --cov-config .coveragerc --cov-report term --cov-report xml --cov=camelot --mpl
+# Switch to no-cov if you want to debug a test with breakpoints.
+# addopts = --verbose --mpl
 python_files = tests/test_*.py
16  setup.py

@@ -19,7 +19,7 @@ requires = [
     'numpy>=1.13.3',
     'openpyxl>=2.5.8',
     'pandas>=0.23.4',
-    'pdfminer.six>=20170720',
+    'pdfminer.six>=20200402',
     'PyPDF2>=1.26.0'
 ]

@@ -32,12 +32,12 @@ plot_requires = [
 ]

 dev_requires = [
-    'codecov>=2.0.15',
-    'pytest>=3.8.0',
-    'pytest-cov>=2.6.0',
-    'pytest-mpl>=0.10',
-    'pytest-runner>=4.2',
-    'Sphinx>=1.7.9'
+    'codecov>=2.1.3',
+    'pytest>=4.6',
+    'pytest-cov>=2.10.0',
+    'pytest-mpl>=0.11',
+    'pytest-runner>=5.2',
+    'Sphinx>=3.0.3'
 ]

 all_requires = cv_requires + plot_requires

@@ -69,7 +69,7 @@ def setup_package():
         },
         classifiers=[
             # Trove classifiers
-            # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers
+            # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers  # noqa
             'License :: OSI Approved :: MIT License',
             'Programming Language :: Python :: 3.6',
             'Programming Language :: Python :: 3.7',
1348  tests/data.py

Binary changes under tests/files: seven existing baseline plot images regenerated (e.g. 8.2 KiB -> 48 KiB) and seven new baseline images added (46-113 KiB), per the image diff summary.
@@ -19,10 +19,16 @@ def test_help_output():
     output = result.output

     assert prog_name == "camelot"
-    assert result.output.startswith("Usage: %(prog_name)s [OPTIONS] COMMAND" % locals())
+    assert result.output.startswith(
+        "Usage: %(prog_name)s [OPTIONS] COMMAND" %
+        locals()
+    )
     assert all(
         v in result.output
-        for v in ["Options:", "--version", "--help", "Commands:", "lattice", "stream"]
+        for v in [
+            "Options:", "--version", "--help", "Commands:", "lattice",
+            "stream"
+        ]
     )
@@ -66,6 +72,26 @@ def test_cli_stream():
         assert format_error in result.output


+def test_cli_network():
+    with TemporaryDirectory() as tempdir:
+        infile = os.path.join(testdir, "budget.pdf")
+        outfile = os.path.join(tempdir, "budget.csv")
+        runner = CliRunner()
+        result = runner.invoke(
+            cli, ["--format", "csv", "--output", outfile, "network", infile]
+        )
+        assert result.exit_code == 0
+        assert result.output == "Found 1 tables\n"
+
+        result = runner.invoke(cli, ["--format", "csv", "network", infile])
+        output_error = "Error: Please specify output file path using --output"
+        assert output_error in result.output
+
+        result = runner.invoke(cli, ["--output", outfile, "network", infile])
+        format_error = "Please specify output file format using --format"
+        assert format_error in result.output
+
+
 def test_cli_password():
     with TemporaryDirectory() as tempdir:
         infile = os.path.join(testdir, "health_protected.pdf")
@@ -121,7 +147,8 @@ def test_cli_output_format():
         outfile = os.path.join(tempdir, "health.json")
         result = runner.invoke(
             cli,
-            ["--format", "json", "--output", outfile, "stream", infile],
+            ["--format", "json", "--output", outfile.format("json"), "stream",
+             infile],
         )
         assert result.exit_code == 0

@@ -129,7 +156,8 @@ def test_cli_output_format():
         outfile = os.path.join(tempdir, "health.xlsx")
         result = runner.invoke(
             cli,
-            ["--format", "excel", "--output", outfile, "stream", infile],
+            ["--format", "excel", "--output", outfile.format("xlsx"), "stream",
+             infile],
         )
         assert result.exit_code == 0

@@ -137,7 +165,8 @@ def test_cli_output_format():
         outfile = os.path.join(tempdir, "health.html")
         result = runner.invoke(
             cli,
-            ["--format", "html", "--output", outfile, "stream", infile],
+            ["--format", "html", "--output", outfile.format("html"), "stream",
+             infile],
         )
         assert result.exit_code == 0

@@ -170,6 +199,10 @@ def test_cli_quiet():
         assert "No tables found on page-1" in result.output

         result = runner.invoke(
-            cli, ["--quiet", "--format", "csv", "--output", outfile, "stream", infile]
+            cli,
+            [
+                "--quiet", "--format", "csv", "--output", outfile, "stream",
+                infile
+            ]
         )
         assert "No tables found on page-1" not in result.output
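The --quiet flag exercised here silences the "No tables found" warning; its library-side counterpart, used by the suppression tests further down this diff, is read_pdf's suppress_stdout argument. A minimal sketch (empty.pdf is a placeholder name):

    import camelot

    # Suppresses "No tables found on page-1" warnings/logs, as --quiet does.
    tables = camelot.read_pdf("empty.pdf", flavor="stream", suppress_stdout=True)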
tests/test_common.py
@@ -8,15 +8,20 @@ from pandas.testing import assert_frame_equal
 import camelot
 from camelot.core import Table, TableList
 from camelot.__version__ import generate_version
+# compare_tables used in console mode while debugging
+from camelot.utils import compare_tables  # noqa

 from .data import *


 testdir = os.path.dirname(os.path.abspath(__file__))
 testdir = os.path.join(testdir, "files")


 def test_parsing_report():
-    parsing_report = {"accuracy": 99.02, "whitespace": 12.24, "order": 1, "page": 1}
+    parsing_report = {
+        "accuracy": 99.02, "whitespace": 12.24, "order": 1, "page": 1
+    }

     filename = os.path.join(testdir, "foo.pdf")
     tables = camelot.read_pdf(filename)
@@ -28,9 +33,11 @@ def test_password():

     filename = os.path.join(testdir, "health_protected.pdf")
     tables = camelot.read_pdf(filename, password="ownerpass", flavor="stream")
+    assert len(tables) == 1
     assert_frame_equal(df, tables[0].df)

     tables = camelot.read_pdf(filename, password="userpass", flavor="stream")
+    assert len(tables) == 1
     assert_frame_equal(df, tables[0].df)

@@ -143,6 +150,194 @@ def test_stream_layout_kwargs():
     assert_frame_equal(df, tables[0].df)


+def test_network():
+    df = pd.DataFrame(data_stream)
+
+    filename = os.path.join(testdir, "health.pdf")
+    tables = camelot.read_pdf(filename, flavor="network")
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_network_table_rotated():
+    df = pd.DataFrame(data_network_table_rotated)
+
+    filename = os.path.join(testdir, "clockwise_table_2.pdf")
+    tables = camelot.read_pdf(filename, flavor="network")
+    assert_frame_equal(df, tables[0].df)
+
+    filename = os.path.join(testdir, "anticlockwise_table_2.pdf")
+    tables = camelot.read_pdf(filename, flavor="network")
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_network_two_tables_a():
+    df1 = pd.DataFrame(data_network_two_tables_1)
+    df2 = pd.DataFrame(data_network_two_tables_2)
+
+    filename = os.path.join(testdir, "tabula/12s0324.pdf")
+    tables = camelot.read_pdf(filename, flavor="network")
+
+    assert len(tables) == 2
+    assert df1.equals(tables[0].df)
+    assert df2.equals(tables[1].df)
+
+
+# Reported as https://github.com/camelot-dev/camelot/issues/132
+def test_network_two_tables_b():
+    df1 = pd.DataFrame(data_network_two_tables_b_1)
+    df2 = pd.DataFrame(data_network_two_tables_b_2)
+
+    filename = os.path.join(testdir, "camelot-issue-132-multiple-tables.pdf")
+    tables = camelot.read_pdf(filename, flavor="network")
+
+    assert len(tables) == 2
+    assert df1.equals(tables[0].df)
+    assert df2.equals(tables[1].df)
+
+
+def test_network_vertical_header():
+    """Tests a complex table with a vertically text header.
+    """
+    df = pd.DataFrame(data_network_vertical_headers)
+
+    filename = os.path.join(testdir, "vertical_header.pdf")
+    tables = camelot.read_pdf(filename, flavor="network")
+    assert len(tables) == 1
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_network_table_regions():
+    df = pd.DataFrame(data_network_table_regions)
+
+    filename = os.path.join(testdir, "tabula/us-007.pdf")
+    # The "stream" test looks for a region in ["320,460,573,335"], which
+    # should exclude the header.
+    tables = camelot.read_pdf(
+        filename, flavor="network", table_regions=["320,335,573,505"]
+    )
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_network_table_areas():
+    df = pd.DataFrame(data_stream_table_areas)
+
+    filename = os.path.join(testdir, "tabula/us-007.pdf")
+    tables = camelot.read_pdf(
+        filename, flavor="network", table_areas=["320,500,573,335"]
+    )
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_network_columns():
+    df = pd.DataFrame(data_stream_columns)
+
+    filename = os.path.join(testdir, "mexican_towns.pdf")
+    tables = camelot.read_pdf(
+        filename, flavor="network", columns=["67,180,230,425,475"], row_tol=10
+    )
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_network_split_text():
+    df = pd.DataFrame(data_network_split_text)
+
+    filename = os.path.join(testdir, "tabula/m27.pdf")
+    tables = camelot.read_pdf(
+        filename,
+        flavor="network",
+        columns=["72,95,209,327,442,529,566,606,683"],
+        split_text=True,
+    )
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_network_flag_size():
+    df = pd.DataFrame(data_network_flag_size)
+
+    filename = os.path.join(testdir, "superscript.pdf")
+    tables = camelot.read_pdf(filename, flavor="network", flag_size=True)
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_network_strip_text():
+    df = pd.DataFrame(data_network_strip_text)
+
+    filename = os.path.join(testdir, "detect_vertical_false.pdf")
+    tables = camelot.read_pdf(filename, flavor="network", strip_text=" ,\n")
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_network_edge_tol():
+    df = pd.DataFrame(data_network_edge_tol)
+
+    filename = os.path.join(testdir, "edge_tol.pdf")
+    tables = camelot.read_pdf(filename, flavor="network", edge_tol=500)
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_network_layout_kwargs():
+    df = pd.DataFrame(data_stream_layout_kwargs)
+
+    filename = os.path.join(testdir, "detect_vertical_false.pdf")
+    tables = camelot.read_pdf(
+        filename, flavor="network", layout_kwargs={"detect_vertical": False}
+    )
+    assert_frame_equal(df, tables[0].df)
+
+
+# Hybrid parser
+def test_hybrid():
+    df = pd.DataFrame(data_hybrid)
+
+    filename = os.path.join(testdir, "health.pdf")
+    tables = camelot.read_pdf(filename, flavor="hybrid")
+    assert_frame_equal(df, tables[0].df)
+
+def test_hybrid_two_tables():
+    df1 = pd.DataFrame(data_network_two_tables_1)
+    df2 = pd.DataFrame(data_network_two_tables_2)
+
+    filename = os.path.join(testdir, "tabula/12s0324.pdf")
+    tables = camelot.read_pdf(filename, flavor="hybrid")
+
+    assert len(tables) == 2
+    assert df1.equals(tables[0].df)
+    assert df2.equals(tables[1].df)
+
+def test_hybrid_vertical_header():
+    """Tests a complex table with a vertically text header.
+    """
+    df = pd.DataFrame(data_hybrid_vertical_headers)
+
+    filename = os.path.join(testdir, "vertical_header.pdf")
+    tables = camelot.read_pdf(filename, flavor="hybrid")
+    assert len(tables) == 1
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_hybrid_process_background():
+    df = pd.DataFrame(data_hybrid_process_background)
+
+    filename = os.path.join(testdir, "background_lines_1.pdf")
+    tables = camelot.read_pdf(
+        filename, flavor="hybrid", process_background=True)
+    assert_frame_equal(df, tables[1].df)
+
+
+def test_hybrid_split_text():
+    df = pd.DataFrame(data_network_split_text)
+
+    filename = os.path.join(testdir, "tabula/m27.pdf")
+    tables = camelot.read_pdf(
+        filename,
+        flavor="hybrid",
+        columns=["72,95,209,327,442,529,566,606,683"],
+        split_text=True,
+    )
+    assert_frame_equal(df, tables[0].df)
+
+
+# Lattice parser tests
 def test_lattice():
     df = pd.DataFrame(data_lattice)

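Taken together, these tests pin down the public surface of the two new flavors. A minimal usage sketch, built only from calls that appear in this diff (file names as in the tests):

    import camelot

    # Text-based "network" parser introduced by this PR.
    tables = camelot.read_pdf("health.pdf", flavor="network")

    # "hybrid" parser, also new; the tests drive it with both text-side options
    # (columns, split_text) and line-side options (process_background).
    tables = camelot.read_pdf("health.pdf", flavor="hybrid")
    print(tables[0].parsing_report)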
@@ -229,9 +424,9 @@ def test_repr():
     tables = camelot.read_pdf(filename)
     assert repr(tables) == "<TableList n=1>"
     assert repr(tables[0]) == "<Table shape=(7, 7)>"
-    assert (
-        repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
-    )
+    assert \
+        repr(tables[0].cells[0][0]) == \
+        "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"


 def test_pages():
@@ -239,22 +434,23 @@ def test_pages():
     tables = camelot.read_pdf(url)
     assert repr(tables) == "<TableList n=1>"
     assert repr(tables[0]) == "<Table shape=(7, 7)>"
-    assert (
-        repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
-    )
+    assert \
+        repr(tables[0].cells[0][0]) == \
+        "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"

     tables = camelot.read_pdf(url, pages="1-end")
     assert repr(tables) == "<TableList n=1>"
     assert repr(tables[0]) == "<Table shape=(7, 7)>"
-    assert (
-        repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
-    )
+    assert \
+        repr(tables[0].cells[0][0]) == \
+        "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"

     tables = camelot.read_pdf(url, pages="all")
     assert repr(tables) == "<TableList n=1>"
     assert repr(tables[0]) == "<Table shape=(7, 7)>"
     assert (
-        repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
+        repr(tables[0].cells[0][0]) ==
+        "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
     )

@@ -264,7 +460,8 @@ def test_url():
     assert repr(tables) == "<TableList n=1>"
     assert repr(tables[0]) == "<Table shape=(7, 7)>"
     assert (
-        repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
+        repr(tables[0].cells[0][0]) ==
+        "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
     )

@@ -284,7 +481,12 @@ def test_table_order():
         return t

     table_list = TableList(
-        [_make_table(2, 1), _make_table(1, 1), _make_table(3, 4), _make_table(1, 2)]
+        [
+            _make_table(2, 1),
+            _make_table(1, 1),
+            _make_table(3, 4),
+            _make_table(1, 2)
+        ]
     )

     assert [(t.page, t.order) for t in sorted(table_list)] == [
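The sorted() call in the last context line relies on tables ordering by (page, order); camelot's Table appears to define __lt__ accordingly. A self-contained sketch of that contract (hypothetical stand-in class, not camelot source):

    # Stand-in with the same ordering contract the test exercises.
    class T:
        def __init__(self, page, order):
            self.page, self.order = page, order

        def __lt__(self, other):
            return (self.page, self.order) < (other.page, other.order)

    tables = [T(2, 1), T(1, 1), T(3, 4), T(1, 2)]
    assert [(t.page, t.order) for t in sorted(tables)] == [
        (1, 1), (1, 2), (2, 1), (3, 4)
    ]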
tests/test_errors.py
@@ -14,32 +14,33 @@ filename = os.path.join(testdir, "foo.pdf")


 def test_unknown_flavor():
-    message = "Unknown flavor specified." " Use either 'lattice' or 'stream'"
+    message = ("Unknown flavor specified."
+               " Use either 'lattice', 'stream', or 'network'")
     with pytest.raises(NotImplementedError, match=message):
-        tables = camelot.read_pdf(filename, flavor="chocolate")
+        camelot.read_pdf(filename, flavor='chocolate')


 def test_input_kwargs():
     message = "columns cannot be used with flavor='lattice'"
     with pytest.raises(ValueError, match=message):
-        tables = camelot.read_pdf(filename, columns=["10,20,30,40"])
+        camelot.read_pdf(filename, columns=['10,20,30,40'])


 def test_unsupported_format():
     message = "File format not supported"
     filename = os.path.join(testdir, "foo.csv")
     with pytest.raises(NotImplementedError, match=message):
-        tables = camelot.read_pdf(filename)
+        camelot.read_pdf(filename)


 def test_stream_equal_length():
     message = "Length of table_areas and columns" " should be equal"
     with pytest.raises(ValueError, match=message):
-        tables = camelot.read_pdf(
+        camelot.read_pdf(
             filename,
-            flavor="stream",
-            table_areas=["10,20,30,40"],
-            columns=["10,20,30,40", "10,20,30,40"],
+            flavor='stream',
+            table_areas=['10,20,30,40'],
+            columns=['10,20,30,40', '10,20,30,40']
         )

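As the updated message shows, 'network' joins the accepted flavors. A sketch of a guard consistent with these tests (hypothetical helper; the real check lives inside camelot.read_pdf, and the tests elsewhere in this diff also accept 'hybrid' even though the message does not mention it):

    def validate_flavor(flavor):
        # Hypothetical mirror of the updated error message.
        if flavor not in ("lattice", "stream", "network", "hybrid"):
            raise NotImplementedError(
                "Unknown flavor specified."
                " Use either 'lattice', 'stream', or 'network'"
            )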
@@ -48,11 +49,9 @@ def test_image_warning():
     with warnings.catch_warnings():
         warnings.simplefilter("error")
         with pytest.raises(UserWarning) as e:
-            tables = camelot.read_pdf(filename)
-        assert (
-            str(e.value)
-            == "page-1 is image-based, camelot only works on text-based pages."
-        )
+            camelot.read_pdf(filename)
+        assert str(e.value) == 'page-1 is image-based, camelot only works ' \
+            'on text-based pages.'


 def test_no_tables_found():
@@ -60,8 +59,8 @@ def test_no_tables_found():
     with warnings.catch_warnings():
         warnings.simplefilter("error")
         with pytest.raises(UserWarning) as e:
-            tables = camelot.read_pdf(filename)
-        assert str(e.value) == "No tables found on page-1"
+            camelot.read_pdf(filename)
+        assert str(e.value) == 'No tables found on page-1'


 def test_no_tables_found_logs_suppressed():
@@ -70,7 +69,7 @@ def test_no_tables_found_logs_suppressed():
         # the test should fail if any warning is thrown
         warnings.simplefilter("error")
         try:
-            tables = camelot.read_pdf(filename, suppress_stdout=True)
+            camelot.read_pdf(filename, suppress_stdout=True)
         except Warning as e:
             warning_text = str(e)
             pytest.fail(f"Unexpected warning: {warning_text}")
@@ -82,7 +81,7 @@ def test_no_tables_found_warnings_suppressed():
         # the test should fail if any warning is thrown
         warnings.simplefilter("error")
         try:
-            tables = camelot.read_pdf(filename, suppress_stdout=True)
+            camelot.read_pdf(filename, suppress_stdout=True)
         except Warning as e:
             warning_text = str(e)
             pytest.fail(f"Unexpected warning: {warning_text}")
@@ -92,11 +91,11 @@ def test_no_password():
     filename = os.path.join(testdir, "health_protected.pdf")
     message = "file has not been decrypted"
     with pytest.raises(Exception, match=message):
-        tables = camelot.read_pdf(filename)
+        camelot.read_pdf(filename)


 def test_bad_password():
     filename = os.path.join(testdir, "health_protected.pdf")
     message = "file has not been decrypted"
     with pytest.raises(Exception, match=message):
-        tables = camelot.read_pdf(filename, password="wrongpass")
+        camelot.read_pdf(filename, password='wrongpass')

tests/test_plotting.py
@@ -3,58 +3,144 @@
 import os

 import pytest
+import matplotlib

 import camelot

+# The version of Matplotlib has an impact on some of the tests. Unfortunately,
+# we can't enforce usage of a recent version of MatplotLib without dropping
+# support for Python 3.6.
+# To check the version of matplotlib installed:
+#   pip freeze | grep matplotlib
+# To force upgrade:
+#   pip install --upgrade --force-reinstall matplotlib
+# To force usage of a Python 3.6 compatible version:
+#   pip install "matplotlib==3.0.3"
+# This condition can be removed in favor of a version requirement bump for
+# matplotlib once support for Python 3.5 is dropped.
+
+LEGACY_MATPLOTLIB = matplotlib.__version__ < "3.2.1"
+
+# Bump the default plot tolerance from 2 to account for cross-platform testing
+# via Travis, and resulting minor font changes.
+TOLERANCE = 4

 testdir = os.path.dirname(os.path.abspath(__file__))
 testdir = os.path.join(testdir, "files")


-@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True)
+def unit_test_stable_plot(table, kind):
+    if not LEGACY_MATPLOTLIB:
+        # See https://matplotlib.org/3.2.1/users/whats_new.html#kerning-adjustments-now-use-correct-values  # noqa
+        matplotlib.rcParams["text.kerning_factor"] = 6
+    return camelot.plot(table, kind=kind)
+
+
+@pytest.mark.mpl_image_compare(
+    baseline_dir="files/baseline_plots", remove_text=True, tolerance=TOLERANCE)
 def test_text_plot():
     filename = os.path.join(testdir, "foo.pdf")
     tables = camelot.read_pdf(filename)
-    return camelot.plot(tables[0], kind="text")
+    return unit_test_stable_plot(tables[0], 'text')


-@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True)
+@pytest.mark.mpl_image_compare(
+    baseline_dir="files/baseline_plots", remove_text=True, tolerance=TOLERANCE)
 def test_grid_plot():
     filename = os.path.join(testdir, "foo.pdf")
     tables = camelot.read_pdf(filename)
-    return camelot.plot(tables[0], kind="grid")
+    return unit_test_stable_plot(tables[0], 'grid')


-@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True)
+@pytest.mark.mpl_image_compare(
+    baseline_dir="files/baseline_plots", remove_text=True, tolerance=TOLERANCE)
+def test_stream_grid_plot():
+    filename = os.path.join(testdir, "foo.pdf")
+    tables = camelot.read_pdf(filename, flavor="stream")
+    return unit_test_stable_plot(tables[0], 'grid')
+
+
+@pytest.mark.mpl_image_compare(
+    baseline_dir="files/baseline_plots", remove_text=True, tolerance=TOLERANCE)
+def test_network_grid_plot():
+    filename = os.path.join(testdir, "foo.pdf")
+    tables = camelot.read_pdf(filename, flavor="network")
+    return unit_test_stable_plot(tables[0], 'grid')
+
+
+@pytest.mark.mpl_image_compare(
+    baseline_dir="files/baseline_plots", remove_text=True, tolerance=TOLERANCE)
 def test_lattice_contour_plot():
     filename = os.path.join(testdir, "foo.pdf")
     tables = camelot.read_pdf(filename)
-    return camelot.plot(tables[0], kind="contour")
+    return unit_test_stable_plot(tables[0], 'contour')


-@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True)
+@pytest.mark.mpl_image_compare(
+    baseline_dir="files/baseline_plots", remove_text=True, tolerance=TOLERANCE)
 def test_stream_contour_plot():
     filename = os.path.join(testdir, "tabula/12s0324.pdf")
-    tables = camelot.read_pdf(filename, flavor="stream")
-    return camelot.plot(tables[0], kind="contour")
+    tables = camelot.read_pdf(filename, flavor='stream')
+    return unit_test_stable_plot(tables[0], 'contour')


-@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True)
+@pytest.mark.mpl_image_compare(
+    baseline_dir="files/baseline_plots", remove_text=True, tolerance=TOLERANCE)
+def test_network_contour_plot():
+    filename = os.path.join(testdir, "tabula/12s0324.pdf")
+    tables = camelot.read_pdf(filename, flavor='network')
+    return unit_test_stable_plot(tables[0], 'contour')
+
+
+@pytest.mark.mpl_image_compare(
+    baseline_dir="files/baseline_plots", remove_text=True, tolerance=TOLERANCE)
 def test_line_plot():
     filename = os.path.join(testdir, "foo.pdf")
     tables = camelot.read_pdf(filename)
-    return camelot.plot(tables[0], kind="line")
+    return unit_test_stable_plot(tables[0], 'line')


-@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True)
+@pytest.mark.mpl_image_compare(
+    baseline_dir="files/baseline_plots", remove_text=True, tolerance=TOLERANCE)
 def test_joint_plot():
     filename = os.path.join(testdir, "foo.pdf")
     tables = camelot.read_pdf(filename)
-    return camelot.plot(tables[0], kind="joint")
+    return unit_test_stable_plot(tables[0], 'joint')


-@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True)
-def test_textedge_plot():
+@pytest.mark.mpl_image_compare(
+    baseline_dir="files/baseline_plots", remove_text=True, tolerance=TOLERANCE)
+def test_stream_textedge_plot():
     filename = os.path.join(testdir, "tabula/12s0324.pdf")
-    tables = camelot.read_pdf(filename, flavor="stream")
-    return camelot.plot(tables[0], kind="textedge")
+    tables = camelot.read_pdf(filename, flavor='stream')
+    return unit_test_stable_plot(tables[0], 'textedge')
+
+
+@pytest.mark.mpl_image_compare(
+    baseline_dir="files/baseline_plots", remove_text=True, tolerance=TOLERANCE)
+def test_network_textedge_plot():
+    filename = os.path.join(testdir, "tabula/12s0324.pdf")
+    tables = camelot.read_pdf(filename, debug=True, flavor='network')
+    return unit_test_stable_plot(tables[0], 'textedge')
+
+
+@pytest.mark.mpl_image_compare(
+    baseline_dir="files/baseline_plots", remove_text=True, tolerance=TOLERANCE)
+def test_network_table_regions_textedge_plot():
+    filename = os.path.join(testdir, "tabula/us-007.pdf")
+    tables = camelot.read_pdf(
+        filename, debug=True, flavor="network",
+        table_regions=["320,505,573,330"]
+    )
+    return unit_test_stable_plot(tables[0], 'textedge')
+
+
+@pytest.mark.mpl_image_compare(
+    baseline_dir="files/baseline_plots", remove_text=True, tolerance=TOLERANCE)
+def test_network_table_areas_text_plot():
+    filename = os.path.join(testdir, "tabula/us-007.pdf")
+    tables = camelot.read_pdf(
+        filename, debug=True, flavor="network",
+        table_areas=["320,500,573,335"]
+    )
+    return unit_test_stable_plot(tables[0], 'text')
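For anyone re-running these comparisons, the file now follows the pytest-mpl baseline workflow: each test returns a figure, and the plugin diffs it against a stored image within the given RMS tolerance. A minimal sketch, assuming pytest-mpl is installed and the test is run with `pytest --mpl`:

    import os

    import pytest

    import camelot

    TOLERANCE = 4  # matches the relaxed tolerance chosen above


    @pytest.mark.mpl_image_compare(
        baseline_dir="files/baseline_plots", remove_text=True,
        tolerance=TOLERANCE)
    def test_example_plot():
        # camelot.plot returns a matplotlib figure for pytest-mpl to compare.
        filename = os.path.join("tests", "files", "foo.pdf")
        tables = camelot.read_pdf(filename)
        return camelot.plot(tables[0], kind="grid")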