Initial Hybrid parser, for now identical to Stream
parent
58823e57e9
commit
d520a77bb7
121
camelot/cli.py
121
camelot/cli.py
|
|
@ -31,7 +31,8 @@ pass_config = click.make_pass_decorator(Config)
|
|||
|
||||
@click.group(name="camelot")
|
||||
@click.version_option(version=__version__)
|
||||
@click.option("-q", "--quiet", is_flag=False, help="Suppress logs and warnings.")
|
||||
@click.option("-q", "--quiet", is_flag=False,
|
||||
help="Suppress logs and warnings.")
|
||||
@click.option(
|
||||
"-p",
|
||||
"--pages",
|
||||
|
|
@ -98,7 +99,8 @@ def cli(ctx, *args, **kwargs):
|
|||
" where x1, y1 -> left-top and x2, y2 -> right-bottom.",
|
||||
)
|
||||
@click.option(
|
||||
"-back", "--process_background", is_flag=True, help="Process background lines."
|
||||
"-back", "--process_background", is_flag=True,
|
||||
help="Process background lines."
|
||||
)
|
||||
@click.option(
|
||||
"-scale",
|
||||
|
|
@ -127,7 +129,8 @@ def cli(ctx, *args, **kwargs):
|
|||
"-l",
|
||||
"--line_tol",
|
||||
default=2,
|
||||
help="Tolerance parameter used to merge close vertical" " and horizontal lines.",
|
||||
help="Tolerance parameter used to merge close vertical"
|
||||
" and horizontal lines.",
|
||||
)
|
||||
@click.option(
|
||||
"-j",
|
||||
|
|
@ -197,12 +200,15 @@ def lattice(c, *args, **kwargs):
|
|||
raise ImportError("matplotlib is required for plotting.")
|
||||
else:
|
||||
if output is None:
|
||||
raise click.UsageError("Please specify output file path using --output")
|
||||
raise click.UsageError(
|
||||
"Please specify output file path using --output")
|
||||
if f is None:
|
||||
raise click.UsageError("Please specify output file format using --format")
|
||||
raise click.UsageError(
|
||||
"Please specify output file format using --format")
|
||||
|
||||
tables = read_pdf(
|
||||
filepath, pages=pages, flavor="lattice", suppress_stdout=quiet, **kwargs
|
||||
filepath, pages=pages, flavor="lattice", suppress_stdout=quiet,
|
||||
**kwargs
|
||||
)
|
||||
click.echo("Found {} tables".format(tables.n))
|
||||
if plot_type is not None:
|
||||
|
|
@ -247,7 +253,8 @@ def lattice(c, *args, **kwargs):
|
|||
"-r",
|
||||
"--row_tol",
|
||||
default=2,
|
||||
help="Tolerance parameter" " used to combine text vertically, to generate rows.",
|
||||
help="Tolerance parameter"
|
||||
" used to combine text vertically, to generate rows.",
|
||||
)
|
||||
@click.option(
|
||||
"-c",
|
||||
|
|
@ -288,9 +295,11 @@ def stream(c, *args, **kwargs):
|
|||
raise ImportError("matplotlib is required for plotting.")
|
||||
else:
|
||||
if output is None:
|
||||
raise click.UsageError("Please specify output file path using --output")
|
||||
raise click.UsageError(
|
||||
"Please specify output file path using --output")
|
||||
if f is None:
|
||||
raise click.UsageError("Please specify output file format using --format")
|
||||
raise click.UsageError(
|
||||
"Please specify output file format using --format")
|
||||
|
||||
tables = read_pdf(
|
||||
filepath, pages=pages, flavor="stream", suppress_stdout=quiet, **kwargs
|
||||
|
|
@ -302,3 +311,97 @@ def stream(c, *args, **kwargs):
|
|||
plt.show()
|
||||
else:
|
||||
tables.export(output, f=f, compress=compress)
|
||||
|
||||
|
||||
@cli.command("hybrid")
@click.option(
    "-R",
    "--table_regions",
    default=[],
    multiple=True,
    help="Page regions to analyze. Example: x1,y1,x2,y2 where x1, y1 -> "
         "left-top and x2, y2 -> right-bottom.",
)
@click.option(
    "-T",
    "--table_areas",
    default=[],
    multiple=True,
    help="Table areas to process. Example: x1,y1,x2,y2 where x1, y1 -> "
         "left-top and x2, y2 -> right-bottom.",
)
@click.option(
    "-C",
    "--columns",
    default=[],
    multiple=True,
    help="X coordinates of column separators.",
)
@click.option(
    "-e",
    "--edge_tol",
    default=50,
    help="Tolerance parameter for extending textedges vertically.",
)
@click.option(
    "-r",
    "--row_tol",
    default=2,
    help="Tolerance parameter used to combine text vertically, "
         "to generate rows.",
)
@click.option(
    "-c",
    "--column_tol",
    default=0,
    help="Tolerance parameter used to combine text horizontally, "
         "to generate columns.",
)
@click.option(
    "-plot",
    "--plot_type",
    type=click.Choice(["text", "grid", "contour", "textedge"]),
    help="Plot elements found on PDF page for visual debugging.",
)
@click.argument("filepath", type=click.Path(exists=True))
@pass_config
def hybrid(c, *args, **kwargs):
    """Use spaces between text to parse the table."""
    # NOTE: the docstring above doubles as the command's help text, so it is
    # kept short and user-facing; implementation notes live in comments.

    # Options stored on the shared config object.  The ones consumed
    # directly by this command are popped; whatever remains is forwarded to
    # read_pdf together with the command-level options.
    settings = c.config
    pages = settings.pop("pages")
    output = settings.pop("output")
    f = settings.pop("format")
    compress = settings.pop("zip")
    quiet = settings.pop("quiet")
    plot_type = kwargs.pop("plot_type")
    filepath = kwargs.pop("filepath")
    kwargs.update(settings)

    # Repeatable options become lists, or None when they are empty.
    for option in ("table_regions", "table_areas", "columns"):
        kwargs[option] = list(kwargs[option]) or None

    plotting = plot_type is not None
    if plotting:
        if not _HAS_MPL:
            raise ImportError("matplotlib is required for plotting.")
    else:
        # Exporting needs both a destination and a format.
        if output is None:
            raise click.UsageError(
                "Please specify output file path using --output")
        if f is None:
            raise click.UsageError(
                "Please specify output file format using --format")

    tables = read_pdf(
        filepath,
        pages=pages,
        flavor="hybrid",
        suppress_stdout=quiet,
        **kwargs
    )
    click.echo("Found {} tables".format(tables.n))

    if not plotting:
        tables.export(output, f=f, compress=compress)
        return

    # NOTE(review): the show() call is assumed to sit inside the loop (one
    # window per table); the original indentation should be confirmed.
    for table in tables:
        plot(table, kind=plot_type)
        plt.show()
|
||||
|
|
|
|||
|
|
@ -379,6 +379,8 @@ class Table(object):
|
|||
self._image = None
|
||||
self._image_path = None # Temporary file to hold an image of the pdf
|
||||
|
||||
self._text = [] # List of text box coordinates
|
||||
|
||||
def __repr__(self):
|
||||
return "<{} shape={}>".format(self.__class__.__name__, self.shape)
|
||||
|
||||
|
|
@ -432,11 +434,11 @@ class Table(object):
|
|||
self.pdf_size = (parser.pdf_width, parser.pdf_height)
|
||||
|
||||
_text = []
|
||||
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in parser.horizontal_text])
|
||||
_text.extend(
|
||||
[(t.x0, t.y0, t.x1, t.y1) for t in parser.horizontal_text])
|
||||
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in parser.vertical_text])
|
||||
self._text = _text
|
||||
|
||||
|
||||
def get_pdf_image(self):
|
||||
"""Compute pdf image and cache it
|
||||
"""
|
||||
|
|
|
|||
|
|
@ -7,7 +7,7 @@ import logging
|
|||
from PyPDF2 import PdfFileReader, PdfFileWriter
|
||||
|
||||
from .core import TableList
|
||||
from .parsers import Stream, Lattice
|
||||
from .parsers import Stream, Lattice, Hybrid
|
||||
from .utils import (
|
||||
build_file_path_in_temp_dir,
|
||||
get_page_layout,
|
||||
|
|
@ -21,7 +21,8 @@ logger = logging.getLogger("camelot")
|
|||
|
||||
PARSERS = {
|
||||
"lattice": Lattice,
|
||||
"stream": Stream
|
||||
"stream": Stream,
|
||||
"hybrid": Hybrid,
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -173,7 +174,7 @@ class PDFHandler():
|
|||
Parameters
|
||||
----------
|
||||
flavor : str (default: 'lattice')
|
||||
The parsing method to use ('lattice' or 'stream').
|
||||
The parsing method to use ('lattice', 'stream', or 'hybrid').
|
||||
Lattice is used by default.
|
||||
suppress_stdout : str (default: False)
|
||||
Suppress logs and warnings.
|
||||
|
|
|
|||
|
|
@ -99,9 +99,10 @@ def read_pdf(
|
|||
|
||||
"""
|
||||
layout_kwargs = layout_kwargs or {}
|
||||
if flavor not in ["lattice", "stream"]:
|
||||
if flavor not in ["lattice", "stream", "hybrid"]:
|
||||
raise NotImplementedError(
|
||||
"Unknown flavor specified." " Use either 'lattice' or 'stream'"
|
||||
"Unknown flavor specified."
|
||||
" Use either 'lattice', 'stream', or 'hybrid'"
|
||||
)
|
||||
|
||||
with warnings.catch_warnings():
|
||||
|
|
|
|||
|
|
@ -2,3 +2,4 @@
|
|||
|
||||
from .stream import Stream
|
||||
from .lattice import Lattice
|
||||
from .hybrid import Hybrid
|
||||
|
|
|
|||
|
|
@ -0,0 +1,441 @@
|
|||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import division
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
|
||||
from .base import BaseParser
|
||||
from ..core import TextEdges
|
||||
from ..utils import (text_in_bbox, text_in_bbox_per_axis)
|
||||
|
||||
|
||||
class Hybrid(BaseParser):
    """Hybrid method of parsing looks for spaces between text
    to parse the table.

    If you want to specify columns when specifying multiple table
    areas, make sure that both lists have the same length.

    Parameters
    ----------
    table_regions : list, optional (default: None)
        List of page regions that may contain tables of the form x1,y1,x2,y2
        where (x1, y1) -> left-top and (x2, y2) -> right-bottom
        in PDF coordinate space.
    table_areas : list, optional (default: None)
        List of table area strings of the form x1,y1,x2,y2
        where (x1, y1) -> left-top and (x2, y2) -> right-bottom
        in PDF coordinate space.
    columns : list, optional (default: None)
        List of column x-coordinates strings where the coordinates
        are comma-separated.
    split_text : bool, optional (default: False)
        Split text that spans across multiple cells.
    flag_size : bool, optional (default: False)
        Flag text based on font size. Useful to detect
        super/subscripts. Adds <s></s> around flagged text.
    strip_text : str, optional (default: '')
        Characters that should be stripped from a string before
        assigning it to a cell.
    edge_tol : int, optional (default: 50)
        Tolerance parameter for extending textedges vertically.
    row_tol : int, optional (default: 2)
        Tolerance parameter used to combine text vertically,
        to generate rows.
    column_tol : int, optional (default: 0)
        Tolerance parameter used to combine text horizontally,
        to generate columns.

    Raises
    ------
    ValueError
        If both `table_areas` and `columns` are given and their
        lengths differ.

    """

    def __init__(
        self,
        table_regions=None,
        table_areas=None,
        columns=None,
        flag_size=False,
        split_text=False,
        strip_text="",
        edge_tol=50,
        row_tol=2,
        column_tol=0,
        **kwargs
    ):
        super().__init__(
            "hybrid",
            table_regions=table_regions,
            table_areas=table_areas,
            split_text=split_text,
            strip_text=strip_text,
            flag_size=flag_size,
        )
        self.columns = columns
        self._validate_columns()
        self.edge_tol = edge_tol
        self.row_tol = row_tol
        self.column_tol = column_tol

    @staticmethod
    def _bbox_from_string(area):
        """Converts an "x1,y1,x2,y2" string into a bounding box tuple.

        Parameters
        ----------
        area : str
            Comma-separated coordinates where (x1, y1) -> left-top and
            (x2, y2) -> right-bottom.

        Returns
        -------
        bbox : tuple
            Tuple (x1, y2, x2, y1) of floats, i.e. the y-coordinates
            are swapped with respect to the input string.

        Raises
        ------
        ValueError
            If the string does not hold exactly four numbers.

        """
        x1, y1, x2, y2 = (float(value) for value in area.split(","))
        return (x1, y2, x2, y1)

    @staticmethod
    def _text_bbox(t_bbox):
        """Returns bounding box for the text present on a page.

        Parameters
        ----------
        t_bbox : dict
            Dict with two keys 'horizontal' and 'vertical' with lists of
            LTTextLineHorizontals and LTTextLineVerticals respectively.

        Returns
        -------
        text_bbox : tuple
            Tuple (x0, y0, x1, y1) in pdf coordinate space.

        Raises
        ------
        ValueError
            If `t_bbox` holds no text objects at all.

        """
        textlines = [t for direction in t_bbox for t in t_bbox[direction]]
        if not textlines:
            # min()/max() would raise ValueError here anyway; keep the
            # exception type but say what actually went wrong.
            raise ValueError(
                "Cannot compute a text bounding box: no text found"
            )
        xmin = min(t.x0 for t in textlines)
        ymin = min(t.y0 for t in textlines)
        xmax = max(t.x1 for t in textlines)
        ymax = max(t.y1 for t in textlines)
        text_bbox = (xmin, ymin, xmax, ymax)
        return text_bbox

    @staticmethod
    def _group_rows(text, row_tol=2):
        """Groups PDFMiner text objects into rows vertically
        within a tolerance.

        The objects are walked in the order given; a new row starts
        whenever an object's y0 is not close to the previous object's y0.

        Parameters
        ----------
        text : list
            List of PDFMiner text objects.
        row_tol : int, optional (default: 2)
            Absolute tolerance used when comparing y0 values.

        Returns
        -------
        rows : list
            Two-dimensional list of text objects grouped into rows,
            each row sorted by x0. Always holds at least one (possibly
            empty) row.

        """
        row_y = None
        rows = []
        temp = []
        non_empty_text = [t for t in text if t.get_text().strip()]
        for t in non_empty_text:
            # is checking for upright necessary?
            # if t.get_text().strip() and all([obj.upright \
            #     for obj in t._objs
            #     if type(obj) is LTChar]):
            if row_y is None:
                row_y = t.y0
            elif not np.isclose(row_y, t.y0, atol=row_tol):
                rows.append(sorted(temp, key=lambda t: t.x0))
                temp = []
            # We update the row's bottom as we go, to be forgiving if there
            # is a gradual change across multiple columns.
            # NOTE(review): placement at loop level is inferred from the
            # comment above; confirm against the original indentation.
            row_y = t.y0
            temp.append(t)
        rows.append(sorted(temp, key=lambda t: t.x0))
        return rows

    @staticmethod
    def _merge_columns(l, column_tol=0):
        """Merges column boundaries horizontally if they overlap
        or lie within a tolerance.

        Parameters
        ----------
        l : list
            List of column x-coordinate tuples. The merge only looks at
            the previously kept tuple, so the caller is expected to pass
            them sorted.
        column_tol : int, optional (default: 0)
            With a non-negative value, columns that overlap or whose gap
            is within the tolerance are merged. With a negative value,
            overlapping columns are kept apart when the overlap is within
            the absolute tolerance.

        Returns
        -------
        merged : list
            List of merged column x-coordinate tuples.

        """
        merged = []
        for higher in l:
            if not merged:
                merged.append(higher)
            else:
                lower = merged[-1]
                if column_tol >= 0:
                    if higher[0] <= lower[1] or np.isclose(
                        higher[0], lower[1], atol=column_tol
                    ):
                        upper_bound = max(lower[1], higher[1])
                        lower_bound = min(lower[0], higher[0])
                        merged[-1] = (lower_bound, upper_bound)
                    else:
                        merged.append(higher)
                elif column_tol < 0:
                    if higher[0] <= lower[1]:
                        if np.isclose(higher[0], lower[1],
                                      atol=abs(column_tol)):
                            merged.append(higher)
                        else:
                            upper_bound = max(lower[1], higher[1])
                            lower_bound = min(lower[0], higher[0])
                            merged[-1] = (lower_bound, upper_bound)
                    else:
                        merged.append(higher)
        return merged

    @staticmethod
    def _join_rows(rows_grouped, text_y_max, text_y_min):
        """Makes row coordinates continuous. For the row to "touch"
        we split the existing gap between them in half.

        Parameters
        ----------
        rows_grouped : list
            Two-dimensional list of text objects grouped into rows.
        text_y_max : int
            Top of the first row.
        text_y_min : int
            Bottom of the last row.

        Returns
        -------
        rows : list
            List of continuous row y-coordinate pairs [top, bottom].

        """
        row_boundaries = [
            [
                max(t.y1 for t in r),
                min(t.y0 for t in r)
            ]
            for r in rows_grouped
        ]
        for i in range(0, len(row_boundaries) - 1):
            top_row = row_boundaries[i]
            bottom_row = row_boundaries[i + 1]
            # Both rows share the midpoint of the gap between them.
            top_row[1] = bottom_row[0] = (top_row[1] + bottom_row[0]) / 2
        row_boundaries[0][0] = text_y_max
        row_boundaries[-1][1] = text_y_min
        return row_boundaries

    @staticmethod
    def _add_columns(cols, text, row_tol):
        """Adds columns to existing list by taking into account
        the text that lies outside the current column x-coordinates.

        Parameters
        ----------
        cols : list
            List of column x-coordinate tuples. Extended in place.
        text : list
            List of PDFMiner text objects.
        row_tol : int
            Tolerance used to group `text` into rows.

        Returns
        -------
        cols : list
            Updated list of column x-coordinate tuples.

        """
        if text:
            text = Hybrid._group_rows(text, row_tol=row_tol)
            elements = [len(r) for r in text]
            # Only the rows holding the most text objects contribute
            # new columns.
            new_cols = [
                (t.x0, t.x1)
                for r in text if len(r) == max(elements)
                for t in r
            ]
            cols.extend(Hybrid._merge_columns(sorted(new_cols)))
        return cols

    @staticmethod
    def _join_columns(cols, text_x_min, text_x_max):
        """Makes column coordinates continuous.

        Parameters
        ----------
        cols : list
            List of column x-coordinate tuples.
        text_x_min : int
            Left boundary of the first column.
        text_x_max : int
            Right boundary of the last column.

        Returns
        -------
        cols : list
            Updated list of column x-coordinate tuples.

        """
        cols = sorted(cols)
        # Separators sit halfway between neighbouring columns.
        cols = [(cols[i][0] + cols[i - 1][1]) / 2 for i in range(1, len(cols))]
        cols.insert(0, text_x_min)
        cols.append(text_x_max)
        cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
        return cols

    def _validate_columns(self):
        """Checks that `columns`, when given with `table_areas`, has one
        entry per table area.

        Raises
        ------
        ValueError
            If both are set and their lengths differ.

        """
        if self.table_areas is not None and self.columns is not None:
            if len(self.table_areas) != len(self.columns):
                raise ValueError("Length of table_areas and columns"
                                 " should be equal")

    def _nurminen_table_detection(self, textlines):
        """A general implementation of the table detection algorithm
        described by Anssi Nurminen's master's thesis.
        Link: https://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3  # noqa

        Assumes that tables are situated relatively far apart
        vertically.

        Note that `textlines` is sorted in place, and the relevant
        textedges found are appended to `self.textedges`.

        Parameters
        ----------
        textlines : list
            List of PDFMiner text objects.

        Returns
        -------
        table_bbox : dict
            Dict keyed by table bounding boxes; the whole page is used
            as the only area when no table area is found.

        """
        # TODO: add support for arabic text #141
        # sort textlines in reading order
        textlines.sort(key=lambda x: (-x.y0, x.x0))
        textedges = TextEdges(edge_tol=self.edge_tol)
        # generate left, middle and right textedges
        textedges.generate(textlines)
        # select relevant edges
        relevant_textedges = textedges.get_relevant()
        self.textedges.extend(relevant_textedges)
        # guess table areas using textlines and relevant edges
        table_bbox = textedges.get_table_areas(textlines, relevant_textedges)
        # treat whole page as table area if no table areas found
        if not table_bbox:
            table_bbox = {(0, 0, self.pdf_width, self.pdf_height): None}

        return table_bbox

    def _generate_table_bbox(self):
        """Populates `self.table_bbox` with the areas to parse.

        User-supplied `table_areas` are used as-is; otherwise areas are
        detected from the horizontal text, restricted to `table_regions`
        when those are given. Also resets `self.textedges`.

        """
        self.textedges = []
        if self.table_areas is None:
            hor_text = self.horizontal_text
            if self.table_regions is not None:
                # filter horizontal text
                hor_text = []
                for region in self.table_regions:
                    region_text = text_in_bbox(
                        self._bbox_from_string(region),
                        self.horizontal_text)
                    hor_text.extend(region_text)
            # find tables based on nurminen's detection algorithm
            table_bbox = self._nurminen_table_detection(hor_text)
        else:
            table_bbox = {
                self._bbox_from_string(area): None
                for area in self.table_areas
            }
        self.table_bbox = table_bbox

    def _generate_columns_and_rows(self, table_idx, tk):
        """Computes the column and row boundaries of one table area.

        Parameters
        ----------
        table_idx : int
            Index of the table; also used to pick the matching entry
            of `self.columns`.
        tk : tuple
            Bounding box of the table area.

        Returns
        -------
        cols : list
            List of column x-coordinate tuples.
        rows : list
            List of row y-coordinate pairs.

        """
        # select elements which lie within table_bbox
        self.t_bbox = text_in_bbox_per_axis(
            tk,
            self.horizontal_text,
            self.vertical_text
        )

        text_x_min, text_y_min, text_x_max, text_y_max = \
            self._text_bbox(self.t_bbox)
        rows_grouped = self._group_rows(
            self.t_bbox["horizontal"], row_tol=self.row_tol)
        rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
        elements = [len(r) for r in rows_grouped]

        if self.columns is not None and self.columns[table_idx] != "":
            # The user supplies the inner separators only; the outer
            # boundaries are taken from the text bounding box.
            cols = self.columns[table_idx].split(",")
            cols = [float(c) for c in cols]
            cols.insert(0, text_x_min)
            cols.append(text_x_max)
            cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
        else:
            # calculate mode of the list of number of elements in
            # each row to guess the number of columns
            ncols = max(set(elements), key=elements.count)
            if ncols == 1:
                # if mode is 1, the page usually contains not tables
                # but there can be cases where the list can be skewed,
                # try to remove all 1s from list in this case and
                # see if the list contains elements, if yes, then use
                # the mode after removing 1s
                elements = list(filter(lambda x: x != 1, elements))
                if elements:
                    ncols = max(set(elements), key=elements.count)
                else:
                    warnings.warn(
                        "No tables found in table area {}"
                        .format(table_idx + 1)
                    )
            cols = [
                (t.x0, t.x1)
                for r in rows_grouped
                if len(r) == ncols
                for t in r
            ]
            cols = self._merge_columns(
                sorted(cols),
                column_tol=self.column_tol
            )
            # Text lying strictly between two neighbouring columns ...
            inner_text = []
            for i in range(1, len(cols)):
                left = cols[i - 1][1]
                right = cols[i][0]
                inner_text.extend(
                    [
                        t
                        for direction in self.t_bbox
                        for t in self.t_bbox[direction]
                        if t.x0 > left and t.x1 < right
                    ]
                )
            # ... and text lying entirely left or right of all columns
            # may reveal columns missed by the mode-based guess.
            outer_text = [
                t
                for direction in self.t_bbox
                for t in self.t_bbox[direction]
                if t.x0 > cols[-1][1] or t.x1 < cols[0][0]
            ]
            inner_text.extend(outer_text)
            cols = self._add_columns(cols, inner_text, self.row_tol)
            cols = self._join_columns(cols, text_x_min, text_x_max)

        return cols, rows

    def _generate_table(self, table_idx, cols, rows, **kwargs):
        """Builds the table object for one table area.

        Parameters
        ----------
        table_idx : int
            Index of the table.
        cols : list
            List of column x-coordinate tuples.
        rows : list
            List of row y-coordinate pairs.

        Returns
        -------
        table : object
            The table returned by `_initialize_new_table`, with all
            edges set and plotting attributes attached.

        """
        table = self._initialize_new_table(table_idx, cols, rows)
        table = table.set_all_edges()
        table.record_parse_metadata(self)

        # for plotting
        table._bbox = self.table_bbox
        table._segments = None
        table._textedges = self.textedges

        return table

    def extract_tables(self, filename):
        """Extracts the tables found on the page.

        Parameters
        ----------
        filename : str
            Not used by this method itself.

        Returns
        -------
        tables : list
            One table per detected area, ordered by descending
            bottom y-coordinate; empty if the document has no text.

        """
        if self._document_has_no_text():
            return []

        # Identify plausible areas within the doc where tables lie,
        # populate table_bbox keys with these areas.
        self._generate_table_bbox()

        _tables = []
        # sort tables based on y-coord
        for table_idx, bbox in enumerate(
            sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True)
        ):
            cols, rows = self._generate_columns_and_rows(table_idx, bbox)
            table = self._generate_table(table_idx, cols, rows)
            # Replace the whole-page dict set by _generate_table with
            # this table's own bounding box.
            table._bbox = bbox
            _tables.append(table)

        return _tables
|
||||
|
|
@ -252,7 +252,6 @@ class Lattice(BaseParser):
|
|||
table_bbox, vertical_segments, horizontal_segments, pdf_scalers
|
||||
)
|
||||
|
||||
|
||||
def _generate_columns_and_rows(self, tk):
|
||||
# select elements which lie within table_bbox
|
||||
v_s, h_s = segments_in_bbox(
|
||||
|
|
|
|||
|
|
@ -37,7 +37,7 @@ class PlotMethods(object):
|
|||
raise NotImplementedError(
|
||||
"Lattice flavor does not support kind='{}'".format(kind)
|
||||
)
|
||||
elif table.flavor == "stream" and kind in ["line"]:
|
||||
elif table.flavor in ["stream", "hybrid"] and kind in ["line"]:
|
||||
raise NotImplementedError(
|
||||
"Stream flavor does not support kind='{}'".format(kind)
|
||||
)
|
||||
|
|
|
|||
Binary file not shown.
|
After Width: | Height: | Size: 15 KiB |
Binary file not shown.
|
After Width: | Height: | Size: 49 KiB |
Binary file not shown.
|
After Width: | Height: | Size: 113 KiB |
Binary file not shown.
|
After Width: | Height: | Size: 113 KiB |
|
|
@ -72,6 +72,26 @@ def test_cli_stream():
|
|||
assert format_error in result.output
|
||||
|
||||
|
||||
def test_cli_hybrid():
    """The hybrid sub-command exports a table and reports missing options."""
    missing_output = "Error: Please specify output file path using --output"
    missing_format = "Please specify output file format using --format"
    runner = CliRunner()

    with TemporaryDirectory() as tempdir:
        source_pdf = os.path.join(testdir, "budget.pdf")
        target_csv = os.path.join(tempdir, "budget.csv")

        # Happy path: both --format and --output are supplied.
        complete = runner.invoke(
            cli,
            ["--format", "csv", "--output", target_csv, "hybrid", source_pdf],
        )
        assert complete.exit_code == 0
        assert complete.output == "Found 1 tables\n"

        # --output left out.
        no_output = runner.invoke(
            cli, ["--format", "csv", "hybrid", source_pdf])
        assert missing_output in no_output.output

        # --format left out.
        no_format = runner.invoke(
            cli, ["--output", target_csv, "hybrid", source_pdf])
        assert missing_format in no_format.output
|
||||
|
||||
|
||||
def test_cli_password():
|
||||
with TemporaryDirectory() as tempdir:
|
||||
infile = os.path.join(testdir, "health_protected.pdf")
|
||||
|
|
|
|||
|
|
@ -148,6 +148,115 @@ def test_stream_layout_kwargs():
|
|||
assert_frame_equal(df, tables[0].df)
|
||||
|
||||
|
||||
def test_hybrid():
    """Default hybrid parsing of health.pdf matches the stream fixture."""
    pdf_path = os.path.join(testdir, "health.pdf")
    expected = pd.DataFrame(data_stream)

    parsed = camelot.read_pdf(pdf_path, flavor="hybrid")
    assert_frame_equal(expected, parsed[0].df)
|
||||
|
||||
|
||||
def test_hybrid_table_rotated():
    """Clockwise and anticlockwise rotated tables parse to the same frame."""
    expected = pd.DataFrame(data_stream_table_rotated)

    for pdf_name in ("clockwise_table_2.pdf", "anticlockwise_table_2.pdf"):
        pdf_path = os.path.join(testdir, pdf_name)
        parsed = camelot.read_pdf(pdf_path, flavor="hybrid")
        assert_frame_equal(expected, parsed[0].df)
|
||||
|
||||
|
||||
def test_hybrid_two_tables():
    """Both tables on the page are found and match their fixtures."""
    pdf_path = os.path.join(testdir, "tabula/12s0324.pdf")
    first_expected = pd.DataFrame(data_stream_two_tables_1)
    second_expected = pd.DataFrame(data_stream_two_tables_2)

    parsed = camelot.read_pdf(pdf_path, flavor="hybrid")

    assert len(parsed) == 2
    assert first_expected.equals(parsed[0].df)
    assert second_expected.equals(parsed[1].df)
|
||||
|
||||
|
||||
def test_hybrid_table_regions():
    """Restricting detection to a page region yields the expected table."""
    pdf_path = os.path.join(testdir, "tabula/us-007.pdf")
    expected = pd.DataFrame(data_stream_table_areas)

    regions = ["320,460,573,335"]
    parsed = camelot.read_pdf(
        pdf_path, flavor="hybrid", table_regions=regions)
    assert_frame_equal(expected, parsed[0].df)
|
||||
|
||||
|
||||
def test_hybrid_table_areas():
    """An explicit table area yields the expected table."""
    pdf_path = os.path.join(testdir, "tabula/us-007.pdf")
    expected = pd.DataFrame(data_stream_table_areas)

    areas = ["320,500,573,335"]
    parsed = camelot.read_pdf(pdf_path, flavor="hybrid", table_areas=areas)
    assert_frame_equal(expected, parsed[0].df)
|
||||
|
||||
|
||||
def test_hybrid_columns():
    """User-supplied column separators are honoured."""
    pdf_path = os.path.join(testdir, "mexican_towns.pdf")
    expected = pd.DataFrame(data_stream_columns)

    separators = ["67,180,230,425,475"]
    parsed = camelot.read_pdf(
        pdf_path, flavor="hybrid", columns=separators, row_tol=10)
    assert_frame_equal(expected, parsed[0].df)
|
||||
|
||||
|
||||
def test_hybrid_split_text():
    """Text spanning several cells is split when split_text is enabled."""
    pdf_path = os.path.join(testdir, "tabula/m27.pdf")
    expected = pd.DataFrame(data_stream_split_text)

    options = {
        "flavor": "hybrid",
        "columns": ["72,95,209,327,442,529,566,606,683"],
        "split_text": True,
    }
    parsed = camelot.read_pdf(pdf_path, **options)
    assert_frame_equal(expected, parsed[0].df)
|
||||
|
||||
|
||||
def test_hybrid_flag_size():
    """flag_size=True output for superscript.pdf matches the fixture."""
    pdf_path = os.path.join(testdir, "superscript.pdf")
    expected = pd.DataFrame(data_stream_flag_size)

    parsed = camelot.read_pdf(pdf_path, flavor="hybrid", flag_size=True)
    assert_frame_equal(expected, parsed[0].df)
|
||||
|
||||
|
||||
def test_hybrid_strip_text():
    """Characters listed in strip_text are removed from the cells."""
    pdf_path = os.path.join(testdir, "detect_vertical_false.pdf")
    expected = pd.DataFrame(data_stream_strip_text)

    parsed = camelot.read_pdf(pdf_path, flavor="hybrid", strip_text=" ,\n")
    assert_frame_equal(expected, parsed[0].df)
|
||||
|
||||
|
||||
def test_hybrid_edge_tol():
    """A custom edge_tol is accepted and yields the expected table."""
    pdf_path = os.path.join(testdir, "edge_tol.pdf")
    expected = pd.DataFrame(data_stream_edge_tol)

    parsed = camelot.read_pdf(pdf_path, flavor="hybrid", edge_tol=500)
    assert_frame_equal(expected, parsed[0].df)
|
||||
|
||||
|
||||
def test_hybrid_layout_kwargs():
    """layout_kwargs are accepted by the hybrid flavor."""
    pdf_path = os.path.join(testdir, "detect_vertical_false.pdf")
    expected = pd.DataFrame(data_stream_layout_kwargs)

    layout = {"detect_vertical": False}
    parsed = camelot.read_pdf(
        pdf_path, flavor="hybrid", layout_kwargs=layout)
    assert_frame_equal(expected, parsed[0].df)
|
||||
|
||||
|
||||
def test_lattice():
|
||||
df = pd.DataFrame(data_lattice)
|
||||
|
||||
|
|
|
|||
|
|
@ -55,6 +55,16 @@ def test_stream_grid_plot():
|
|||
return camelot.plot(tables[0], kind='grid')
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    LEGACY_MATPLOTLIB,
    reason="depends on a recent version of MatPlotLib",
)
@pytest.mark.mpl_image_compare(
    baseline_dir="files/baseline_plots",
    remove_text=True,
)
def test_hybrid_grid_plot():
    """Grid plot of the first hybrid table, compared against the baseline."""
    pdf_path = os.path.join(testdir, "foo.pdf")
    first_table = camelot.read_pdf(pdf_path, flavor="hybrid")[0]
    return camelot.plot(first_table, kind='grid')
|
||||
|
||||
|
||||
@pytest.mark.mpl_image_compare(
|
||||
baseline_dir="files/baseline_plots", remove_text=True)
|
||||
def test_lattice_contour_plot():
|
||||
|
|
@ -73,6 +83,16 @@ def test_stream_contour_plot():
|
|||
return camelot.plot(tables[0], kind='contour')
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    LEGACY_MATPLOTLIB,
    reason="depends on a recent version of MatPlotLib",
)
@pytest.mark.mpl_image_compare(
    baseline_dir="files/baseline_plots",
    remove_text=True,
)
def test_hybrid_contour_plot():
    """Contour plot of the first hybrid table, compared against the baseline."""
    pdf_path = os.path.join(testdir, "tabula/12s0324.pdf")
    first_table = camelot.read_pdf(pdf_path, flavor='hybrid')[0]
    return camelot.plot(first_table, kind='contour')
|
||||
|
||||
|
||||
@pytest.mark.skipif(LEGACY_MATPLOTLIB,
|
||||
reason="depends on a recent version of MatPlotLib")
|
||||
@pytest.mark.mpl_image_compare(
|
||||
|
|
@ -97,7 +117,17 @@ def test_joint_plot():
|
|||
reason="depends on a recent version of MatPlotLib")
|
||||
@pytest.mark.mpl_image_compare(
|
||||
baseline_dir="files/baseline_plots", remove_text=True)
|
||||
def test_textedge_plot():
|
||||
def test_stream_textedge_plot():
|
||||
filename = os.path.join(testdir, "tabula/12s0324.pdf")
|
||||
tables = camelot.read_pdf(filename, flavor='stream')
|
||||
return camelot.plot(tables[0], kind='textedge')
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    LEGACY_MATPLOTLIB,
    reason="depends on a recent version of MatPlotLib",
)
@pytest.mark.mpl_image_compare(
    baseline_dir="files/baseline_plots",
    remove_text=True,
)
def test_hybrid_textedge_plot():
    """Textedge plot of the first hybrid table, compared against the baseline."""
    pdf_path = os.path.join(testdir, "tabula/12s0324.pdf")
    first_table = camelot.read_pdf(pdf_path, flavor='hybrid')[0]
    return camelot.plot(first_table, kind='textedge')
|
||||
|
|
|
|||
Loading…
Reference in New Issue