pull/153/merge
FrancoisHuet 2020-07-21 13:15:53 +02:00 committed by GitHub
commit d392000a5f
42 changed files with 5180 additions and 1034 deletions

--- a/camelot/cli.py
+++ b/camelot/cli.py

@@ -18,7 +18,7 @@ logger = logging.getLogger("camelot")
 logger.setLevel(logging.INFO)


-class Config(object):
+class Config():
     def __init__(self):
         self.config = {}
@@ -31,7 +31,8 @@ pass_config = click.make_pass_decorator(Config)
 @click.group(name="camelot")
 @click.version_option(version=__version__)
-@click.option("-q", "--quiet", is_flag=False, help="Suppress logs and warnings.")
+@click.option("-q", "--quiet", is_flag=False,
+              help="Suppress logs and warnings.")
 @click.option(
     "-p",
     "--pages",
@@ -57,7 +58,7 @@ pass_config = click.make_pass_decorator(Config)
     "-flag",
     "--flag_size",
     is_flag=True,
-    help="Flag text based on" " font size. Useful to detect super/subscripts.",
+    help="Flag text based on font size. Useful to detect super/subscripts.",
 )
 @click.option(
     "-strip",
@@ -98,7 +99,8 @@ def cli(ctx, *args, **kwargs):
     " where x1, y1 -> left-top and x2, y2 -> right-bottom.",
 )
 @click.option(
-    "-back", "--process_background", is_flag=True, help="Process background lines."
+    "-back", "--process_background", is_flag=True,
+    help="Process background lines."
 )
 @click.option(
     "-scale",
@@ -127,7 +129,8 @@ def cli(ctx, *args, **kwargs):
     "-l",
     "--line_tol",
     default=2,
-    help="Tolerance parameter used to merge close vertical" " and horizontal lines.",
+    help="Tolerance parameter used to merge close vertical"
+    " and horizontal lines.",
 )
 @click.option(
     "-j",
@@ -197,12 +200,15 @@ def lattice(c, *args, **kwargs):
             raise ImportError("matplotlib is required for plotting.")
     else:
         if output is None:
-            raise click.UsageError("Please specify output file path using --output")
+            raise click.UsageError(
+                "Please specify output file path using --output")
         if f is None:
-            raise click.UsageError("Please specify output file format using --format")
+            raise click.UsageError(
+                "Please specify output file format using --format")
     tables = read_pdf(
-        filepath, pages=pages, flavor="lattice", suppress_stdout=quiet, **kwargs
+        filepath, pages=pages, flavor="lattice", suppress_stdout=quiet,
+        **kwargs
     )
     click.echo(f"Found {tables.n} tables")
     if plot_type is not None:
@@ -247,7 +253,8 @@ def lattice(c, *args, **kwargs):
     "-r",
     "--row_tol",
     default=2,
-    help="Tolerance parameter" " used to combine text vertically, to generate rows.",
+    help="Tolerance parameter"
+    " used to combine text vertically, to generate rows.",
 )
 @click.option(
     "-c",
@@ -288,9 +295,11 @@ def stream(c, *args, **kwargs):
             raise ImportError("matplotlib is required for plotting.")
     else:
         if output is None:
-            raise click.UsageError("Please specify output file path using --output")
+            raise click.UsageError(
+                "Please specify output file path using --output")
         if f is None:
-            raise click.UsageError("Please specify output file format using --format")
+            raise click.UsageError(
+                "Please specify output file format using --format")
     tables = read_pdf(
         filepath, pages=pages, flavor="stream", suppress_stdout=quiet, **kwargs
@@ -302,3 +311,98 @@ def stream(c, *args, **kwargs):
             plt.show()
     else:
         tables.export(output, f=f, compress=compress)
+
+
+@cli.command("network")
+@click.option(
+    "-R",
+    "--table_regions",
+    default=[],
+    multiple=True,
+    help="Page regions to analyze. Example: x1,y1,x2,y2"
+    " where x1, y1 -> left-top and x2, y2 -> right-bottom.",
+)
+@click.option(
+    "-T",
+    "--table_areas",
+    default=[],
+    multiple=True,
+    help="Table areas to process. Example: x1,y1,x2,y2"
+    " where x1, y1 -> left-top and x2, y2 -> right-bottom.",
+)
+@click.option(
+    "-C",
+    "--columns",
+    default=[],
+    multiple=True,
+    help="X coordinates of column separators.",
+)
+@click.option(
+    "-e",
+    "--edge_tol",
+    default=50,
+    help="Tolerance parameter" " for extending textedges vertically.",
+)
+@click.option(
+    "-r",
+    "--row_tol",
+    default=2,
+    help="Tolerance parameter"
+    " used to combine text vertically, to generate rows.",
+)
+@click.option(
+    "-c",
+    "--column_tol",
+    default=0,
+    help="Tolerance parameter"
+    " used to combine text horizontally, to generate columns.",
+)
+@click.option(
+    "-plot",
+    "--plot_type",
+    type=click.Choice(["text", "grid", "contour", "textedge"]),
+    help="Plot elements found on PDF page for visual debugging.",
+)
+@click.argument("filepath", type=click.Path(exists=True))
+@pass_config
+def network(c, *args, **kwargs):
+    """Use spaces between text to parse the table."""
+    conf = c.config
+    pages = conf.pop("pages")
+    output = conf.pop("output")
+    f = conf.pop("format")
+    compress = conf.pop("zip")
+    quiet = conf.pop("quiet")
+    plot_type = kwargs.pop("plot_type")
+    filepath = kwargs.pop("filepath")
+    kwargs.update(conf)
+
+    table_regions = list(kwargs["table_regions"])
+    kwargs["table_regions"] = None if not table_regions else table_regions
+    table_areas = list(kwargs["table_areas"])
+    kwargs["table_areas"] = None if not table_areas else table_areas
+    columns = list(kwargs["columns"])
+    kwargs["columns"] = None if not columns else columns
+
+    if plot_type is not None:
+        if not _HAS_MPL:
+            raise ImportError("matplotlib is required for plotting.")
+    else:
+        if output is None:
+            raise click.UsageError(
+                "Please specify output file path using --output")
+        if f is None:
+            raise click.UsageError(
+                "Please specify output file format using --format")
+
+    tables = read_pdf(
+        filepath, pages=pages, flavor="network",
+        suppress_stdout=quiet, **kwargs
+    )
+    click.echo(f"Found {tables.n} tables")
+    if plot_type is not None:
+        for table in tables:
+            plot(table, kind=plot_type)
+            plt.show()
+    else:
+        tables.export(output, f=f, compress=compress)
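
For illustration, the new command maps onto the same library call the other commands use; a minimal sketch ("report.pdf" is a stand-in path, "network" is the flavor this PR adds):

import camelot

tables = camelot.read_pdf("report.pdf", pages="1", flavor="network")
print(f"Found {tables.n} tables")
tables.export("tables.csv", f="csv")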

--- a/camelot/core.py
+++ b/camelot/core.py

@@ -4,12 +4,20 @@ import os
 import sqlite3
 import zipfile
 import tempfile
+from itertools import chain
 from operator import itemgetter

 import numpy as np
 import pandas as pd
+from cv2 import cv2
+
+from .utils import (
+    get_index_closest_point,
+    get_textline_coords,
+    build_file_path_in_temp_dir,
+    export_pdf_as_png
+)
 # minimum number of vertical textline intersections for a textedge
 # to be considered valid

@@ -18,14 +26,70 @@ TEXTEDGE_REQUIRED_ELEMENTS = 4
 TABLE_AREA_PADDING = 10

-class TextEdge(object):
-    """Defines a text edge coordinates relative to a left-bottom
-    origin. (PDF coordinate space)
+HORIZONTAL_ALIGNMENTS = ["left", "right", "middle"]
+VERTICAL_ALIGNMENTS = ["top", "bottom", "center"]
+ALL_ALIGNMENTS = HORIZONTAL_ALIGNMENTS + VERTICAL_ALIGNMENTS
+
+
+class TextAlignment():
+    """Represents a list of textlines sharing an alignment on a coordinate.
+
+    The alignment can be left/right/middle or top/bottom/center.
+    (PDF coordinate space)

     Parameters
     ----------
-    x : float
-        x-coordinate of the text edge.
+    coord : float
+        coordinate of the initial text edge. Depending on the alignment
+        it could be a vertical or horizontal coordinate.
+    textline : obj
+        the original textline to start the alignment
+    align : str
+        Name of the alignment (e.g. "left", "top", etc)
+
+    Attributes
+    ----------
+    coord : float
+        The coordinate aligned averaged out across textlines. It can be along
+        the x or y axis.
+    textlines : array
+        Array of textlines that demonstrate this alignment.
+    align : str
+        Name of the alignment (e.g. "left", "top", etc)
+    """
+
+    def __init__(self, coord, textline, align):
+        self.coord = coord
+        self.textlines = [textline]
+        self.align = align
+
+    def __repr__(self):
+        text_inside = " | ".join(
+            map(lambda x: x.get_text(), self.textlines[:2])).replace("\n", "")
+        return f"<TextEdge coord={self.coord} tl={len(self.textlines)} " \
+               f"textlines text='{text_inside}...'>"
+
+    def register_aligned_textline(self, textline, coord):
+        """Update new textline to this alignment, adapting its average."""
+        # Increase the intersections for this segment, expand it up,
+        # and adjust the x based on the new value
+        self.coord = (self.coord * len(self.textlines) + coord) / \
+            float(len(self.textlines) + 1)
+        self.textlines.append(textline)
+
+
+class TextEdge(TextAlignment):
+    """Defines a text edge coordinates relative to a left-bottom
+    origin. (PDF coordinate space).
+
+    An edge is an alignment bounded over a segment.
+
+    Parameters
+    ----------
+    coord : float
+        coordinate of the text edge. Can be x or y.
     y0 : float
         y-coordinate of bottommost point.
     y1 : float
@@ -35,101 +99,120 @@ class TextEdge(object):

     Attributes
     ----------
-    intersections: int
-        Number of intersections with horizontal text rows.
     is_valid: bool
-        A text edge is valid if it intersections with at least
+        A text edge is valid if it intersects with at least
         TEXTEDGE_REQUIRED_ELEMENTS horizontal text rows.
     """

-    def __init__(self, x, y0, y1, align="left"):
-        self.x = x
-        self.y0 = y0
-        self.y1 = y1
-        self.align = align
-        self.intersections = 0
+    def __init__(self, coord, textline, align):
+        super().__init__(coord, textline, align)
+        self.y0 = textline.y0
+        self.y1 = textline.y1
         self.is_valid = False

     def __repr__(self):
-        x = round(self.x, 2)
+        x = round(self.coord, 2)
         y0 = round(self.y0, 2)
         y1 = round(self.y1, 2)
-        return f"<TextEdge x={x} y0={y0} y1={y1} align={self.align} valid={self.is_valid}>"
+        return f"<TextEdge x={x} y0={y0} y1={y1} align={self.align} " \
+               f"valid={self.is_valid}>"

-    def update_coords(self, x, y0, edge_tol=50):
+    def update_coords(self, x, textline, edge_tol=50):
         """Updates the text edge's x and bottom y coordinates and sets
         the is_valid attribute.
         """
-        if np.isclose(self.y0, y0, atol=edge_tol):
-            self.x = (self.intersections * self.x + x) / float(self.intersections + 1)
-            self.y0 = y0
-            self.intersections += 1
+        if np.isclose(self.y0, textline.y0, atol=edge_tol):
+            self.register_aligned_textline(textline, x)
+            self.y0 = textline.y0
             # a textedge is valid only if it extends uninterrupted
             # over a required number of textlines
-            if self.intersections > TEXTEDGE_REQUIRED_ELEMENTS:
+            if len(self.textlines) > TEXTEDGE_REQUIRED_ELEMENTS:
                 self.is_valid = True


-class TextEdges(object):
+class TextAlignments():
+    """Defines a dict of text edges across reference alignments.
+    """
+
+    def __init__(self, alignment_names):
+        # For each possible alignment, list of tuples coordinate/textlines
+        self._text_alignments = {}
+        for alignment_name in alignment_names:
+            self._text_alignments[alignment_name] = []
+
+    @staticmethod
+    def _create_new_text_alignment(coord, textline, align):
+        return TextAlignment(coord, textline, align)
+
+    def _update_alignment(self, alignment, coord, textline):
+        return NotImplemented
+
+    def _register_textline(self, textline):
+        """Updates an existing text edge in the current dict.
+        """
+        coords = get_textline_coords(textline)
+        for alignment_id, alignment_array in self._text_alignments.items():
+            coord = coords[alignment_id]
+
+            # Find the index of the closest existing element (or 0 if none)
+            idx_closest = get_index_closest_point(
+                coord, alignment_array, fn=lambda x: x.coord
+            )
+
+            # Check if the edges before/after are close enough
+            # that it can be considered aligned
+            idx_insert = None
+            if idx_closest is None:
+                idx_insert = 0
+            else:
+                coord_closest = alignment_array[idx_closest].coord
+                # Note: np.isclose is slow!
+                if coord - 0.5 < coord_closest < coord + 0.5:
+                    self._update_alignment(
+                        alignment_array[idx_closest],
+                        coord,
+                        textline
+                    )
+                elif coord_closest < coord:
+                    idx_insert = idx_closest + 1
+                else:
+                    idx_insert = idx_closest
+            if idx_insert is not None:
+                new_alignment = self._create_new_text_alignment(
+                    coord, textline, alignment_id
+                )
+                alignment_array.insert(idx_insert, new_alignment)
+
+
+class TextEdges(TextAlignments):
     """Defines a dict of left, right and middle text edges found on
     the PDF page. The dict has three keys based on the alignments,
     and each key's value is a list of camelot.core.TextEdge objects.
     """

     def __init__(self, edge_tol=50):
+        super().__init__(HORIZONTAL_ALIGNMENTS)
         self.edge_tol = edge_tol
-        self._textedges = {"left": [], "right": [], "middle": []}

     @staticmethod
-    def get_x_coord(textline, align):
-        """Returns the x coordinate of a text row based on the
-        specified alignment.
-        """
-        x_left = textline.x0
-        x_right = textline.x1
-        x_middle = x_left + (x_right - x_left) / 2.0
-        x_coord = {"left": x_left, "middle": x_middle, "right": x_right}
-        return x_coord[align]
+    def _create_new_text_alignment(coord, textline, align):
+        # In TextEdges, each alignment is a TextEdge
+        return TextEdge(coord, textline, align)

-    def find(self, x_coord, align):
-        """Returns the index of an existing text edge using
-        the specified x coordinate and alignment.
-        """
-        for i, te in enumerate(self._textedges[align]):
-            if np.isclose(te.x, x_coord, atol=0.5):
-                return i
-        return None
+    def add(self, coord, textline, align):
+        """Adds a new text edge to the current dict."""
+        te = self._create_new_text_alignment(coord, textline, align)
+        self._text_alignments[align].append(te)

-    def add(self, textline, align):
-        """Adds a new text edge to the current dict.
-        """
-        x = self.get_x_coord(textline, align)
-        y0 = textline.y0
-        y1 = textline.y1
-        te = TextEdge(x, y0, y1, align=align)
-        self._textedges[align].append(te)
-
-    def update(self, textline):
-        """Updates an existing text edge in the current dict.
-        """
-        for align in ["left", "right", "middle"]:
-            x_coord = self.get_x_coord(textline, align)
-            idx = self.find(x_coord, align)
-            if idx is None:
-                self.add(textline, align)
-            else:
-                self._textedges[align][idx].update_coords(
-                    x_coord, textline.y0, edge_tol=self.edge_tol
-                )
+    def _update_alignment(self, alignment, coord, textline):
+        alignment.update_coords(coord, textline, self.edge_tol)

     def generate(self, textlines):
-        """Generates the text edges dict based on horizontal text
-        rows.
-        """
+        """Generates the text edges dict based on horizontal text rows."""
         for tl in textlines:
             if len(tl.get_text().strip()) > 1:  # TODO: hacky
-                self.update(tl)
+                self._register_textline(tl)

     def get_relevant(self):
         """Returns the list of relevant text edges (all share the same
@@ -138,13 +221,16 @@ class TextEdges(object):
         """
         intersections_sum = {
             "left": sum(
-                te.intersections for te in self._textedges["left"] if te.is_valid
+                len(te.textlines) for te in self._text_alignments["left"]
+                if te.is_valid
             ),
             "right": sum(
-                te.intersections for te in self._textedges["right"] if te.is_valid
+                len(te.textlines) for te in self._text_alignments["right"]
+                if te.is_valid
             ),
             "middle": sum(
-                te.intersections for te in self._textedges["middle"] if te.is_valid
+                len(te.textlines) for te in self._text_alignments["middle"]
+                if te.is_valid
             ),
         }
@@ -152,7 +238,10 @@
         # get vertical textedges that intersect maximum number of
         # times with horizontal textlines
         relevant_align = max(intersections_sum.items(), key=itemgetter(1))[0]
-        return self._textedges[relevant_align]
+        return list(filter(
+            lambda te: te.is_valid,
+            self._text_alignments[relevant_align])
+        )

     def get_table_areas(self, textlines, relevant_textedges):
         """Returns a dict of interesting table areas on the PDF page
@@ -168,13 +257,12 @@
             return (x0, y0, x1, y1)

         # sort relevant textedges in reading order
-        relevant_textedges.sort(key=lambda te: (-te.y0, te.x))
+        relevant_textedges.sort(key=lambda te: (-te.y0, te.coord))

         table_areas = {}
         for te in relevant_textedges:
-            if te.is_valid:
             if not table_areas:
-                table_areas[(te.x, te.y0, te.x, te.y1)] = None
+                table_areas[(te.coord, te.y0, te.coord, te.y1)] = None
             else:
                 found = None
                 for area in table_areas:
@@ -183,13 +271,13 @@
                         found = area
                         break
                 if found is None:
-                    table_areas[(te.x, te.y0, te.x, te.y1)] = None
+                    table_areas[(te.coord, te.y0, te.coord, te.y1)] = None
                 else:
                     table_areas.pop(found)
                     updated_area = (
                         found[0],
                         min(te.y0, found[1]),
-                        max(found[2], te.x),
+                        max(found[2], te.coord),
                         max(found[3], te.y1),
                     )
                     table_areas[updated_area] = None
@@ -218,7 +306,8 @@
                     max(found[3], tl.y1),
                 )
                 table_areas[updated_area] = None
-        average_textline_height = sum_textline_height / float(len(textlines))
+        average_textline_height = sum_textline_height / \
+            float(len(textlines))

         # add some padding to table areas
         table_areas_padded = {}
@@ -228,7 +317,7 @@
         return table_areas_padded


-class Cell(object):
+class Cell():
     """Defines a cell in a table with coordinates relative to a
     left-bottom origin. (PDF coordinate space)
@@ -304,14 +393,13 @@
     @property
     def bound(self):
-        """The number of sides on which the cell is bounded.
-        """
+        """The number of sides on which the cell is bounded."""
         return self.top + self.bottom + self.left + self.right


-class Table(object):
-    """Defines a table with coordinates relative to a left-bottom
-    origin. (PDF coordinate space)
+class Table():
+    """Defines a table with coordinates relative to a left-bottom origin.
+    (PDF coordinate space)

     Parameters
     ----------
@@ -331,6 +419,8 @@
         Accuracy with which text was assigned to the cell.
     whitespace : float
         Percentage of whitespace in the table.
+    filename : str
+        Path of the original PDF
     order : int
         Table number on PDF page.
     page : int
@@ -341,13 +431,27 @@
     def __init__(self, cols, rows):
         self.cols = cols
         self.rows = rows
-        self.cells = [[Cell(c[0], r[1], c[1], r[0]) for c in cols] for r in rows]
+        self.cells = [
+            [Cell(c[0], r[1], c[1], r[0]) for c in cols] for r in rows
+        ]
         self.df = None
         self.shape = (0, 0)
         self.accuracy = 0
         self.whitespace = 0
+        self.filename = None
         self.order = None
         self.page = None
+        self.flavor = None         # Flavor of the parser used
+        self.pdf_size = None       # Dimensions of the original PDF page
+        self._bbox = None          # Bounding box in original document
+        self.parse = None          # Parse information
+        self.parse_details = None  # Field holding extra debug data
+        self._image = None
+        self._image_path = None    # Temporary file to hold an image of the pdf
+        self._text = []            # List of text box coordinates
+        self.textlines = []        # List of actual textlines on the page

     def __repr__(self):
         return f"<{self.__class__.__name__} shape={self.shape}>"
@@ -356,8 +460,7 @@
         if self.page == other.page:
             if self.order < other.order:
                 return True
-        if self.page < other.page:
-            return True
+        return self.page < other.page

     @property
     def data(self):
@@ -382,6 +485,19 @@
         }
         return report

+    def get_pdf_image(self):
+        """Compute pdf image and cache it
+        """
+        if self._image is None:
+            if self._image_path is None:
+                self._image_path = build_file_path_in_temp_dir(
+                    os.path.basename(self.filename),
+                    ".png"
+                )
+                export_pdf_as_png(self.filename, self._image_path)
+            self._image = cv2.imread(self._image_path)
+        return self._image
+
     def set_all_edges(self):
         """Sets all table edges to True.
         """
@@ -548,7 +664,7 @@
                 bottom = cell.bottom
                 if cell.bound == 4:
                     continue
-                elif cell.bound == 3:
+                if cell.bound == 3:
                     if not left and (right and top and bottom):
                         cell.hspan = True
                     elif not right and (left and top and bottom):
@@ -578,7 +694,8 @@
             Output filepath.

         """
-        kw = {"encoding": "utf-8", "index": False, "header": False, "quoting": 1}
+        kw = {"encoding": "utf-8", "index": False, "header": False,
+              "quoting": 1}
         kw.update(kwargs)
         self.df.to_csv(path, **kw)
@@ -615,6 +732,7 @@
             "encoding": "utf-8",
         }
         kw.update(kwargs)
+        # pylint: disable=abstract-class-instantiated
         writer = pd.ExcelWriter(path)
         self.df.to_excel(writer, **kw)
         writer.save()
@@ -653,8 +771,41 @@
             conn.commit()
             conn.close()

+    def copy_spanning_text(self, copy_text=None):
+        """Copies over text in empty spanning cells.
+
+        Parameters
+        ----------
+        copy_text : list, optional (default: None)
+            {'h', 'v'}
+            Select one or more strings from above and pass them as a list
+            to specify the direction in which text should be copied over
+            when a cell spans multiple rows or columns.
+
+        Returns
+        -------
+        t : camelot.core.Table
+        """
+        for f in copy_text:
+            if f == "h":
+                for i, row in enumerate(self.cells):
+                    for j, cell in enumerate(row):
+                        if cell.text.strip() == "" and \
+                                cell.hspan and \
+                                not cell.left:
+                            cell.text = self.cells[i][j - 1].text
+            elif f == "v":
+                for i, row in enumerate(self.cells):
+                    for j, cell in enumerate(row):
+                        if cell.text.strip() == "" and \
+                                cell.vspan and \
+                                not cell.top:
+                            cell.text = self.cells[i - 1][j].text
+        return self
+

-class TableList(object):
+class TableList():
     """Defines a list of camelot.core.Table objects. Each table can
     be accessed using its index.
@@ -734,10 +885,15 @@
                 self._compress_dir(**kwargs)
         elif f == "excel":
             filepath = os.path.join(dirname, basename)
+            # pylint: disable=abstract-class-instantiated
             writer = pd.ExcelWriter(filepath)
             for table in self._tables:
                 sheet_name = f"page-{table.page}-table-{table.order}"
-                table.df.to_excel(writer, sheet_name=sheet_name, encoding="utf-8")
+                table.df.to_excel(
+                    writer,
+                    sheet_name=sheet_name,
+                    encoding="utf-8"
+                )
             writer.save()
             if compress:
                 zipname = os.path.join(os.path.dirname(path), root) + ".zip"
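
For illustration, copy_spanning_text is reached through the existing copy_text kwarg once record_parse_metadata runs; a minimal sketch ("spanning.pdf" is a stand-in path):

import camelot

# Copy text into empty cells that a merged cell spans vertically ("v")
# and/or horizontally ("h").
tables = camelot.read_pdf("spanning.pdf", flavor="lattice", copy_text=["v"])
print(tables[0].df)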

--- a/camelot/handlers.py
+++ b/camelot/handlers.py

@@ -2,13 +2,14 @@
 import os
 import sys
+import logging

 from PyPDF2 import PdfFileReader, PdfFileWriter

 from .core import TableList
-from .parsers import Stream, Lattice
+from .parsers import Stream, Lattice, Network, Hybrid
 from .utils import (
-    TemporaryDirectory,
+    build_file_path_in_temp_dir,
     get_page_layout,
     get_text_objects,
     get_rotation,
@@ -16,8 +17,17 @@ from .utils import (
     download_url,
 )

+logger = logging.getLogger("camelot")
+
+PARSERS = {
+    "lattice": Lattice,
+    "stream": Stream,
+    "network": Network,
+    "hybrid": Hybrid,
+}
+

-class PDFHandler(object):
+class PDFHandler():
     """Handles all operations like temp directory creation, splitting
     file into single page PDFs, parsing each PDF and then removing the
     temp directory.
@@ -31,10 +41,13 @@ class PDFHandler(object):
         Example: '1,3,4' or '1,4-end' or 'all'.
     password : str, optional (default: None)
         Password for decryption.
+    debug : bool, optional (default: False)
+        Whether the parser should store debug information during parsing.

     """

-    def __init__(self, filepath, pages="1", password=None):
+    def __init__(self, filepath, pages="1", password=None, debug=False):
+        self.debug = debug
         if is_url(filepath):
             filepath = download_url(filepath)
         self.filepath = filepath
@@ -89,38 +102,54 @@ class PDFHandler(object):
             P.extend(range(p["start"], p["end"] + 1))
         return sorted(set(P))

-    def _save_page(self, filepath, page, temp):
-        """Saves specified page from PDF into a temporary directory.
+    def _read_pdf_page(self, page=1, layout_kwargs=None):
+        """Saves specified page from PDF into a temporary directory. Removes
+        password protection and normalizes rotation.

         Parameters
         ----------
-        filepath : str
-            Filepath or URL of the PDF file.
         page : int
             Page number.
-        temp : str
-            Tmp directory.
+        layout_kwargs : dict, optional (default: {})
+            A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.  # noqa
+
+        Returns
+        -------
+        layout : object
+        dimensions : tuple
+            The dimensions of the pdf page
+        filepath : str
+            The path of the single page PDF - either the original, or a
+            normalized version.

         """
-        with open(filepath, "rb") as fileobj:
+        layout_kwargs = layout_kwargs or {}
+        with open(self.filepath, "rb") as fileobj:
+            # Normalize the pdf file, but skip if it's not encrypted or has
+            # only one page.
             infile = PdfFileReader(fileobj, strict=False)
             if infile.isEncrypted:
                 infile.decrypt(self.password)
-            fpath = os.path.join(temp, f"page-{page}.pdf")
+            fpath = build_file_path_in_temp_dir(f"page-{page}.pdf")
             froot, fext = os.path.splitext(fpath)
             p = infile.getPage(page - 1)
             outfile = PdfFileWriter()
             outfile.addPage(p)
             with open(fpath, "wb") as f:
                 outfile.write(f)
-            layout, dim = get_page_layout(fpath)
+            layout, dimensions = get_page_layout(
+                fpath, **layout_kwargs)
             # fix rotated PDF
             chars = get_text_objects(layout, ltype="char")
             horizontal_text = get_text_objects(layout, ltype="horizontal_text")
             vertical_text = get_text_objects(layout, ltype="vertical_text")
             rotation = get_rotation(chars, horizontal_text, vertical_text)
             if rotation != "":
-                fpath_new = "".join([froot.replace("page", "p"), "_rotated", fext])
+                fpath_new = "".join(
+                    [froot.replace("page", "p"), "_rotated", fext])
                 os.rename(fpath, fpath_new)
                 infile = PdfFileReader(open(fpath_new, "rb"), strict=False)
                 if infile.isEncrypted:
@@ -134,9 +163,13 @@ class PDFHandler(object):
                 outfile.addPage(p)
                 with open(fpath, "wb") as f:
                     outfile.write(f)
+                layout, dimensions = get_page_layout(
+                    fpath, **layout_kwargs)
+        return layout, dimensions, fpath

     def parse(
-        self, flavor="lattice", suppress_stdout=False, layout_kwargs={}, **kwargs
+        self, flavor="lattice", suppress_stdout=False,
+        layout_kwargs=None, **kwargs
     ):
         """Extracts tables by calling parser.get_tables on all single
         page PDFs.
@@ -144,12 +177,13 @@ class PDFHandler(object):
         Parameters
         ----------
         flavor : str (default: 'lattice')
-            The parsing method to use ('lattice' or 'stream').
+            The parsing method to use ('lattice', 'stream', 'network',
+            or 'hybrid').
             Lattice is used by default.
         suppress_stdout : str (default: False)
             Suppress logs and warnings.
         layout_kwargs : dict, optional (default: {})
-            A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
+            A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.  # noqa
         kwargs : dict
             See camelot.read_pdf kwargs.
@@ -159,17 +193,24 @@ class PDFHandler(object):
             List of tables found in PDF.

         """
+        layout_kwargs = layout_kwargs or {}
         tables = []
-        with TemporaryDirectory() as tempdir:
-            for p in self.pages:
-                self._save_page(self.filepath, p, tempdir)
-            pages = [
-                os.path.join(tempdir, f"page-{p}.pdf") for p in self.pages
-            ]
-            parser = Lattice(**kwargs) if flavor == "lattice" else Stream(**kwargs)
-            for p in pages:
-                t = parser.extract_tables(
-                    p, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs
-                )
+
+        parser_obj = PARSERS[flavor]
+        parser = parser_obj(debug=self.debug, **kwargs)
+
+        # Read the layouts/dimensions of each of the pages we need to
+        # parse. This might require creating a temporary .pdf.
+        for page_idx in self.pages:
+            layout, dimensions, source_file = self._read_pdf_page(
+                page_idx,
+                layout_kwargs=layout_kwargs
+            )
+            parser.prepare_page_parse(source_file, layout, dimensions,
+                                      page_idx, layout_kwargs)
+            if not suppress_stdout:
+                rootname = os.path.basename(parser.rootname)
+                logger.info(f"Processing {rootname}")
+            t = parser.extract_tables()
             tables.extend(t)
         return TableList(sorted(tables))
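
A minimal sketch of the reworked flow (PDFHandler is internal API; the path is a stand-in):

from camelot.handlers import PDFHandler

handler = PDFHandler("report.pdf", pages="1-2", debug=True)
# One parser instance is picked from PARSERS and reused across pages;
# each page is normalized by _read_pdf_page() before extraction.
tables = handler.parse(flavor="hybrid")
print(len(tables))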

--- a/camelot/image_processing.py
+++ b/camelot/image_processing.py

@@ -6,7 +6,6 @@ import numpy as np
 def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
     """Thresholds an image using OpenCV's adaptiveThreshold.
-
     Parameters
     ----------
     imagename : string

@@ -16,21 +15,17 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
     blocksize : int, optional (default: 15)
         Size of a pixel neighborhood that is used to calculate a
         threshold value for the pixel: 3, 5, 7, and so on.
-
         For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
     c : int, optional (default: -2)
         Constant subtracted from the mean or weighted mean.
         Normally, it is positive but may be zero or negative as well.
-
         For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
-
     Returns
     -------
     img : object
         numpy.ndarray representing the original image.
     threshold : object
         numpy.ndarray representing the thresholded image.
-
     """
     img = cv2.imread(imagename)
     gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

@@ -56,7 +51,6 @@ def find_lines(
 ):
     """Finds horizontal and vertical lines by applying morphological
     transformations on an image.
-
     Parameters
     ----------
     threshold : object

@@ -70,14 +64,11 @@ def find_lines(
     line_scale : int, optional (default: 15)
         Factor by which the page dimensions will be divided to get
         smallest length of lines that should be detected.
-
         The larger this value, smaller the detected lines. Making it
         too large will lead to text being detected as lines.
     iterations : int, optional (default: 0)
         Number of times for erosion/dilation is applied.
-
         For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
-
     Returns
     -------
     dmask : object

@@ -87,7 +78,6 @@ def find_lines(
         List of tuples representing vertical/horizontal lines with
         coordinates relative to a left-top origin in
         image coordinate space.
-
     """
     lines = []

@@ -135,21 +125,18 @@
 def find_contours(vertical, horizontal):
     """Finds table boundaries using OpenCV's findContours.
-
     Parameters
     ----------
     vertical : object
         numpy.ndarray representing pixels where vertical lines lie.
     horizontal : object
         numpy.ndarray representing pixels where horizontal lines lie.
-
     Returns
     -------
     cont : list
         List of tuples representing table boundaries. Each tuple is of
         the form (x, y, w, h) where (x, y) -> left-top, w -> width and
         h -> height in image coordinate space.
-
     """
     mask = vertical + horizontal

@@ -175,7 +162,6 @@ def find_contours(vertical, horizontal):
 def find_joints(contours, vertical, horizontal):
     """Finds joints/intersections present inside each table boundary.
-
     Parameters
     ----------
     contours : list

@@ -186,7 +172,6 @@ def find_joints(contours, vertical, horizontal):
         numpy.ndarray representing pixels where vertical lines lie.
     horizontal : object
         numpy.ndarray representing pixels where horizontal lines lie.
-
     Returns
     -------
     tables : dict

@@ -194,7 +179,6 @@ def find_joints(contours, vertical, horizontal):
         in that boundary as their value.
         Keys are of the form (x1, y1, x2, y2) where (x1, y1) -> lb
         and (x2, y2) -> rt in image coordinate space.
-
     """
     joints = np.multiply(vertical, horizontal)
     tables = {}
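
For illustration, the thresholding entry point per the docstring above; "page-1.png" is a stand-in for a page already rendered to PNG, and an OpenCV install is assumed:

from camelot.image_processing import adaptive_threshold

img, threshold = adaptive_threshold("page-1.png", process_background=False,
                                    blocksize=15, c=-2)
print(img.shape, threshold.shape)  # original and thresholded arrays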

--- a/camelot/io.py
+++ b/camelot/io.py

@@ -12,9 +12,9 @@ def read_pdf(
     password=None,
     flavor="lattice",
     suppress_stdout=False,
-    layout_kwargs={},
-    **kwargs
-):
+    layout_kwargs=None,
+    debug=False,
+    **kwargs):
     """Read PDF and return extracted tables.

     Note: kwargs annotated with ^ can only be used with flavor='stream'
@@ -80,16 +80,16 @@ def read_pdf(
         Size of a pixel neighborhood that is used to calculate a
         threshold value for the pixel: 3, 5, 7, and so on.
-        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
+        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.  # noqa
     threshold_constant* : int, optional (default: -2)
         Constant subtracted from the mean or weighted mean.
         Normally, it is positive but may be zero or negative as well.
-        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
+        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.  # noqa
     iterations* : int, optional (default: 0)
         Number of times for erosion/dilation is applied.
-        For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
+        For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.  # noqa
     resolution* : int, optional (default: 300)
         Resolution used for PDF to PNG conversion.
@@ -98,9 +98,11 @@ def read_pdf(
     tables : camelot.core.TableList

     """
-    if flavor not in ["lattice", "stream"]:
+    layout_kwargs = layout_kwargs or {}
+    if flavor not in ["lattice", "stream", "network", "hybrid"]:
         raise NotImplementedError(
-            "Unknown flavor specified." " Use either 'lattice' or 'stream'"
+            "Unknown flavor specified."
+            " Use either 'lattice', 'stream', or 'network'"
         )

     with warnings.catch_warnings():
@@ -108,7 +110,7 @@ def read_pdf(
         warnings.simplefilter("ignore")
         validate_input(kwargs, flavor=flavor)
-        p = PDFHandler(filepath, pages=pages, password=password)
+        p = PDFHandler(filepath, pages=pages, password=password, debug=debug)
         kwargs = remove_extra(kwargs, flavor=flavor)
         tables = p.parse(
             flavor=flavor,
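
For illustration, the new kwargs surface like this; "doc.pdf" is a stand-in path, and debug=True makes the parser retain parse_details on each table:

import camelot

tables = camelot.read_pdf("doc.pdf", flavor="network", debug=True)
print(tables[0].parse_details)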

--- a/camelot/parsers/__init__.py
+++ b/camelot/parsers/__init__.py

@@ -2,3 +2,5 @@
 from .stream import Stream
 from .lattice import Lattice
+from .network import Network
+from .hybrid import Hybrid

--- a/camelot/parsers/base.py
+++ b/camelot/parsers/base.py

@@ -1,20 +1,484 @@
 # -*- coding: utf-8 -*-

 import os
+import warnings

-from ..utils import get_page_layout, get_text_objects
+import numpy as np
+import pandas as pd
+
+from ..utils import (
+    bbox_from_str,
+    compute_accuracy,
+    compute_whitespace,
+    get_text_objects,
+    get_table_index,
+    text_in_bbox,
+)
+from ..core import Table


-class BaseParser(object):
+class BaseParser():
     """Defines a base parser.
     """

+    def __init__(
+            self,
+            parser_id,
+            table_regions=None,
+            table_areas=None,
+            copy_text=None,
+            split_text=False,
+            strip_text="",
+            shift_text=None,
+            flag_size=False,
+            debug=False):
+        self.id = parser_id
+        self.table_regions = table_regions
+        self.table_areas = table_areas
+        self.table_bbox_parses = {}
+        self.columns = None
+        self.copy_text = copy_text
+        self.split_text = split_text
+        self.strip_text = strip_text
+        self.shift_text = shift_text
+        self.flag_size = flag_size
+        self.rootname = None
+        self.t_bbox = None
+        # For plotting details of parsing algorithms
+        self.resolution = 300  # default plotting resolution of the PDF.
+        self.parse_details = {}
+        if not debug:
+            self.parse_details = None
+
+    def table_bboxes(self):
+        return sorted(
+            self.table_bbox_parses.keys(),
+            key=lambda x: x[1],
+            reverse=True
+        )
+
-    def _generate_layout(self, filename, layout_kwargs):
+    def prepare_page_parse(self, filename, layout, dimensions,
+                           page_idx, layout_kwargs):
         self.filename = filename
         self.layout_kwargs = layout_kwargs
-        self.layout, self.dimensions = get_page_layout(filename, **layout_kwargs)
+        self.layout = layout
+        self.dimensions = dimensions
+        self.page = page_idx
         self.images = get_text_objects(self.layout, ltype="image")
-        self.horizontal_text = get_text_objects(self.layout, ltype="horizontal_text")
-        self.vertical_text = get_text_objects(self.layout, ltype="vertical_text")
+        self.horizontal_text = get_text_objects(
+            self.layout,
+            ltype="horizontal_text"
+        )
+        self.vertical_text = get_text_objects(
+            self.layout,
+            ltype="vertical_text"
+        )
         self.pdf_width, self.pdf_height = self.dimensions
         self.rootname, __ = os.path.splitext(self.filename)
+        if self.parse_details is not None:
+            self.parse_details["table_regions"] = self.table_regions
+            self.parse_details["table_areas"] = self.table_areas
+
+    def _apply_regions_filter(self, textlines):
+        """If regions have been specified, filter textlines to these regions.
+
+        Parameters
+        ----------
+        textlines : list
+            list of textlines to be filtered
+
+        Returns
+        -------
+        filtered_textlines : list of textlines within the regions specified
+        """
+        filtered_textlines = []
+        if self.table_regions is None:
+            filtered_textlines.extend(textlines)
+        else:
+            for region_str in self.table_regions:
+                region_text = text_in_bbox(
+                    bbox_from_str(region_str),
+                    textlines
+                )
+                filtered_textlines.extend(region_text)
+        return filtered_textlines
+
+    def _document_has_no_text(self):
+        """Detects image only documents and warns.
+
+        Returns
+        -------
+        has_no_text : bool
+            Whether the document doesn't have any text at all.
+        """
+        if not self.horizontal_text:
+            rootname = os.path.basename(self.rootname)
+            if self.images:
+                warnings.warn(
+                    "{rootname} is image-based, "
+                    "camelot only works on text-based pages."
+                    .format(rootname=rootname)
+                )
+            else:
+                warnings.warn(
+                    "No tables found on {rootname}".format(rootname=rootname)
+                )
+            return True
+        return False
+
+    def _initialize_new_table(self, table_idx, bbox, cols, rows):
+        """Initialize new table object, ready to be populated
+
+        Parameters
+        ----------
+        table_idx : int
+            Index of this table within the pdf page analyzed
+        bbox : set
+            bounding box of this table within the pdf page analyzed
+        cols : list
+            list of coordinate boundaries tuples (left, right)
+        rows : list
+            list of coordinate boundaries tuples (bottom, top)
+
+        Returns
+        -------
+        table : camelot.core.Table
+        """
+        table = Table(cols, rows)
+        table.page = self.page
+        table.order = table_idx + 1
+        table._bbox = bbox
+        return table
+
+    @staticmethod
+    def _reduce_index(t, idx, shift_text):
+        """Reduces index of a text object if it lies within a spanning
+        cell. Only useful for some parsers (e.g. Lattice), base method is a
+        noop.
+        """
+        return idx
+
+    def compute_parse_errors(self, table):
+        pos_errors = []
+        # TODO: have a single list in place of two directional ones?
+        # sorted on x-coordinate based on reading order i.e. LTR or RTL
+        for direction in ["vertical", "horizontal"]:
+            for t in self.t_bbox[direction]:
+                indices, error = get_table_index(
+                    table,
+                    t,
+                    direction,
+                    split_text=self.split_text,
+                    flag_size=self.flag_size,
+                    strip_text=self.strip_text,
+                )
+                if indices[:2] != (-1, -1):
+                    pos_errors.append(error)
+                    indices = type(self)._reduce_index(
+                        table,
+                        indices,
+                        shift_text=self.shift_text
+                    )
+                    for r_idx, c_idx, text in indices:
+                        table.cells[r_idx][c_idx].text = text
+        return pos_errors
+
+    def _generate_columns_and_rows(self, bbox, user_cols):
+        # Pure virtual, must be defined by the derived parser
+        raise NotImplementedError()
+
+    def _generate_table(self, table_idx, bbox, cols, rows, **kwargs):
+        # Pure virtual, must be defined by the derived parser
+        raise NotImplementedError()
+
+    def _generate_table_bbox(self):
+        # Pure virtual, must be defined by the derived parser
+        raise NotImplementedError()
+
+    def extract_tables(self):
+        if self._document_has_no_text():
+            return []
+
+        # Identify plausible areas within the doc where tables lie,
+        # populate table_bbox keys with these areas.
+        self._generate_table_bbox()
+
+        _tables = []
+        # sort tables based on y-coord
+        for table_idx, bbox in enumerate(self.table_bboxes()):
+            if self.columns is not None and self.columns[table_idx] != "":
+                # user has to input boundary columns too
+                # take (0, pdf_width) by default
+                # similar to else condition
+                # len can't be 1
+                user_cols = self.columns[table_idx].split(",")
+                user_cols = [float(c) for c in user_cols]
+            else:
+                user_cols = None
+
+            cols, rows, v_s, h_s = self._generate_columns_and_rows(
+                bbox,
+                user_cols
+            )
+            table = self._generate_table(
+                table_idx, bbox, cols, rows, v_s=v_s, h_s=h_s)
+            _tables.append(table)
+        return _tables
+
+    def record_parse_metadata(self, table):
+        """Record data about the origin of the table
+        """
+        table.flavor = self.id
+        table.filename = self.filename
+        table.parse = self.table_bbox_parses[table._bbox]
+        table.parse_details = self.parse_details
+        pos_errors = self.compute_parse_errors(table)
+        table.accuracy = compute_accuracy([[100, pos_errors]])
+
+        if self.copy_text is not None:
+            table.copy_spanning_text(self.copy_text)
+
+        data = table.data
+        table.df = pd.DataFrame(data)
+        table.shape = table.df.shape
+        table.whitespace = compute_whitespace(data)
+        table.pdf_size = (self.pdf_width, self.pdf_height)
+        _text = []
+        _text.extend(
+            [(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
+        _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
+        table._text = _text
+        table.textlines = self.horizontal_text + self.vertical_text
+
+
+class TextBaseParser(BaseParser):
+    """Base class for all text parsers.
+    """
+
+    def __init__(
+            self,
+            parser_id,
+            table_regions=None,
+            table_areas=None,
+            columns=None,
+            flag_size=False,
+            split_text=False,
+            strip_text="",
+            edge_tol=50,
+            row_tol=2,
+            column_tol=0,
+            debug=False,
+            **kwargs):
+        super().__init__(
+            parser_id,
+            table_regions=table_regions,
+            table_areas=table_areas,
+            split_text=split_text,
+            strip_text=strip_text,
+            flag_size=flag_size,
+            debug=debug,
+        )
+        self.columns = columns
+        self._validate_columns()
+        self.edge_tol = edge_tol
+        self.row_tol = row_tol
+        self.column_tol = column_tol
+
+    @staticmethod
+    def _group_rows(text, row_tol=2):
+        """Groups PDFMiner text objects into rows vertically
+        within a tolerance.
+
+        Parameters
+        ----------
+        text : list
+            List of PDFMiner text objects.
+        row_tol : int, optional (default: 2)
+
+        Returns
+        -------
+        rows : list
+            Two-dimensional list of text objects grouped into rows.
+        """
+        row_y = None
+        rows = []
+        temp = []
+        non_empty_text = [t for t in text if t.get_text().strip()]
+        for t in non_empty_text:
+            # is checking for upright necessary?
+            # if t.get_text().strip() and all([obj.upright
+            #                                  for obj in t._objs
+            #                                  if type(obj) is LTChar]):
+            if row_y is None:
+                row_y = t.y0
+            elif not np.isclose(row_y, t.y0, atol=row_tol):
+                rows.append(sorted(temp, key=lambda t: t.x0))
+                temp = []
+                # We update the row's bottom as we go, to be forgiving if there
+                # is a gradual change across multiple columns.
+                row_y = t.y0
+            temp.append(t)
+        rows.append(sorted(temp, key=lambda t: t.x0))
+        return rows
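
For illustration, the grouping rule on bare y0 values (numbers are made up): a new row opens when y0 drifts more than row_tol from the y0 that started the current row.

import numpy as np

ys = [700.0, 699.5, 640.0, 639.2]  # y0 of four textlines, top to bottom
rows, temp, row_y = [], [], None
for y in ys:
    if row_y is None:
        row_y = y
    elif not np.isclose(row_y, y, atol=2):  # row_tol = 2
        rows.append(temp)
        temp = []
        row_y = y
    temp.append(y)
rows.append(temp)
print(rows)  # [[700.0, 699.5], [640.0, 639.2]]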
+    @staticmethod
+    def _merge_columns(l, column_tol=0):
+        """Merges column boundaries horizontally if they overlap
+        or lie within a tolerance.
+
+        Parameters
+        ----------
+        l : list
+            List of column x-coordinate tuples.
+        column_tol : int, optional (default: 0)
+
+        Returns
+        -------
+        merged : list
+            List of merged column x-coordinate tuples.
+        """
+        merged = []
+        for higher in l:
+            if not merged:
+                merged.append(higher)
+            else:
+                lower = merged[-1]
+                if column_tol >= 0:
+                    if higher[0] <= lower[1] or np.isclose(
+                        higher[0], lower[1], atol=column_tol
+                    ):
+                        upper_bound = max(lower[1], higher[1])
+                        lower_bound = min(lower[0], higher[0])
+                        merged[-1] = (lower_bound, upper_bound)
+                    else:
+                        merged.append(higher)
+                elif column_tol < 0:
+                    if higher[0] <= lower[1]:
+                        if np.isclose(higher[0], lower[1],
+                                      atol=abs(column_tol)):
+                            merged.append(higher)
+                        else:
+                            upper_bound = max(lower[1], higher[1])
+                            lower_bound = min(lower[0], higher[0])
+                            merged[-1] = (lower_bound, upper_bound)
+                    else:
+                        merged.append(higher)
+        return merged
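
For illustration, with this PR's branch installed the merge behaves as follows (interval values are made up):

from camelot.parsers.base import TextBaseParser

# (0, 25) and (20, 45) overlap, so they merge; (60, 80) stays separate.
print(TextBaseParser._merge_columns([(0, 25), (20, 45), (60, 80)]))
# [(0, 45), (60, 80)]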
+    @staticmethod
+    def _join_rows(rows_grouped, text_y_max, text_y_min):
+        """Makes row coordinates continuous. For the row to "touch"
+        we split the existing gap between them in half.
+
+        Parameters
+        ----------
+        rows_grouped : list
+            Two-dimensional list of text objects grouped into rows.
+        text_y_max : int
+        text_y_min : int
+
+        Returns
+        -------
+        rows : list
+            List of continuous row y-coordinate tuples.
+        """
+        row_boundaries = [
+            [
+                max(t.y1 for t in r),
+                min(t.y0 for t in r)
+            ]
+            for r in rows_grouped
+        ]
+        for i in range(0, len(row_boundaries) - 1):
+            top_row = row_boundaries[i]
+            bottom_row = row_boundaries[i + 1]
+            top_row[1] = bottom_row[0] = (top_row[1] + bottom_row[0]) / 2
+        row_boundaries[0][0] = text_y_max
+        row_boundaries[-1][1] = text_y_min
+        return row_boundaries
+
+    @staticmethod
+    def _add_columns(cols, text, row_tol):
+        """Adds columns to existing list by taking into account
+        the text that lies outside the current column x-coordinates.
+
+        Parameters
+        ----------
+        cols : list
+            List of column x-coordinate tuples.
+        text : list
+            List of PDFMiner text objects.
+        row_tol : int
+
+        Returns
+        -------
+        cols : list
+            Updated list of column x-coordinate tuples.
+        """
+        if text:
+            text = TextBaseParser._group_rows(text, row_tol=row_tol)
+            elements = [len(r) for r in text]
+            new_cols = [
+                (t.x0, t.x1)
+                for r in text if len(r) == max(elements)
+                for t in r
+            ]
+            cols.extend(TextBaseParser._merge_columns(sorted(new_cols)))
+        return cols
+
+    @staticmethod
+    def _join_columns(cols, text_x_min, text_x_max):
+        """Makes column coordinates continuous.
+
+        Parameters
+        ----------
+        cols : list
+            List of column x-coordinate tuples.
+        text_x_min : int
+        text_x_max : int
+
+        Returns
+        -------
+        cols : list
+            Updated list of column x-coordinate tuples.
+        """
+        cols = sorted(cols)
+        cols = [(cols[i][0] + cols[i - 1][1]) / 2 for i in range(1, len(cols))]
+        cols.insert(0, text_x_min)
+        cols.append(text_x_max)
+        cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
+        return cols
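
For illustration (branch installed, values made up): gaps between column intervals are closed at their midpoints, and the page's text extent caps both ends.

from camelot.parsers.base import TextBaseParser

print(TextBaseParser._join_columns([(10, 40), (60, 90)], 0, 120))
# [(0, 50.0), (50.0, 120)]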
+    def _validate_columns(self):
+        if self.table_areas is not None and self.columns is not None:
+            if len(self.table_areas) != len(self.columns):
+                raise ValueError("Length of table_areas and columns"
+                                 " should be equal")
+
+    def _generate_table(self, table_idx, bbox, cols, rows, **kwargs):
+        table = self._initialize_new_table(table_idx, bbox, cols, rows)
+        table = table.set_all_edges()
+        self.record_parse_metadata(table)
+        return table
+
+    def record_parse_metadata(self, table):
+        """Record data about the origin of the table
+        """
+        super().record_parse_metadata(table)
+        # for plotting
+        table._segments = None

--- /dev/null
+++ b/camelot/parsers/hybrid.py

@ -0,0 +1,235 @@
# -*- coding: utf-8 -*-
from ..utils import (
bboxes_overlap,
boundaries_to_split_lines,
)
import numpy as np
from .base import BaseParser
from .network import Network
from .lattice import Lattice
class Hybrid(BaseParser):
"""Defines a hybrid parser, leveraging both network and lattice parsers.
Parameters
----------
table_regions : list, optional (default: None)
List of page regions that may contain tables of the form x1,y1,x2,y2
where (x1, y1) -> left-top and (x2, y2) -> right-bottom
in PDF coordinate space.
table_areas : list, optional (default: None)
List of table area strings of the form x1,y1,x2,y2
where (x1, y1) -> left-top and (x2, y2) -> right-bottom
in PDF coordinate space.
columns : list, optional (default: None)
List of column x-coordinates strings where the coordinates
are comma-separated.
split_text : bool, optional (default: False)
Split text that spans across multiple cells.
flag_size : bool, optional (default: False)
Flag text based on font size. Useful to detect
super/subscripts. Adds <s></s> around flagged text.
strip_text : str, optional (default: '')
Characters that should be stripped from a string before
assigning it to a cell.
edge_tol : int, optional (default: 50)
Tolerance parameter for extending textedges vertically.
row_tol : int, optional (default: 2)
Tolerance parameter used to combine text vertically,
to generate rows.
column_tol : int, optional (default: 0)
Tolerance parameter used to combine text horizontally,
to generate columns.
"""
def __init__(
self,
table_regions=None,
table_areas=None,
columns=None,
flag_size=False,
split_text=False,
strip_text="",
edge_tol=None,
row_tol=2,
column_tol=0,
debug=False,
**kwargs):
super().__init__(
"hybrid",
table_regions=table_regions,
table_areas=table_areas,
flag_size=flag_size,
split_text=split_text,
strip_text=strip_text,
debug=debug,
)
self.columns = columns # Columns settings impacts the hybrid table
self.network_parser = Network(
table_regions=table_regions,
table_areas=table_areas,
columns=columns,
flag_size=flag_size,
split_text=split_text,
strip_text=strip_text,
edge_tol=edge_tol,
row_tol=row_tol,
column_tol=column_tol,
debug=debug,
)
self.lattice_parser = Lattice(
table_regions=table_regions,
table_areas=table_areas,
flag_size=flag_size,
split_text=split_text,
strip_text=strip_text,
edge_tol=edge_tol,
row_tol=row_tol,
column_tol=column_tol,
debug=debug,
)
def prepare_page_parse(self, filename, layout, dimensions,
page_idx, layout_kwargs):
super().prepare_page_parse(filename, layout, dimensions,
page_idx, layout_kwargs)
self.network_parser.prepare_page_parse(
filename, layout, dimensions, page_idx, layout_kwargs)
self.lattice_parser.prepare_page_parse(
filename, layout, dimensions, page_idx, layout_kwargs)
def _generate_columns_and_rows(self, bbox, table_idx):
parser = self.table_bbox_parses[bbox]
return parser._generate_columns_and_rows(bbox, table_idx)
def _generate_table(self, table_idx, bbox, cols, rows, **kwargs):
parser = self.table_bbox_parses[bbox]
table = parser._generate_table(table_idx, bbox, cols, rows, **kwargs)
# Because hybrid can inject extraneous splits from both lattice and
# network, remove lines / cols that are completely empty.
table.df = table.df.replace('', np.nan)
table.df = table.df.dropna(axis=0, how="all")
table.df = table.df.dropna(axis=1, how="all")
table.df = table.df.replace(np.nan, '')
table.shape = table.df.shape
return table
@staticmethod
def _augment_boundaries_with_splits(boundaries, splits, tolerance=0):
""" Augment existing boundaries using provided hard splits.
Boundaries: |---| |-| |---------|
Splits: | | | |
Augmented: |-------|-----|-------|--|
"""
idx_boundaries = len(boundaries) - 1
idx_splits = len(splits) - 1
previous_boundary = None
while True:
if idx_splits < 0:
# No more splits to incorporate, we're done
break
split = splits[idx_splits]
if idx_boundaries < 0:
# Need to insert remaining splits
new_boundary = [split, boundaries[0][0]]
boundaries.insert(0, new_boundary)
idx_splits = idx_splits - 1
else:
boundary = boundaries[idx_boundaries]
if boundary[1] < split + tolerance:
# The lattice split lies to the right of this boundary:
# extend the boundary's right edge to meet the split.
boundary[1] = split
# And if there was another segment to the right, move its
# left edge to the split as well so that there's no gap
if previous_boundary is not None:
previous_boundary[0] = split
idx_splits = idx_splits - 1
elif boundary[0] > split - tolerance:
# This boundary lies entirely to the right of the split:
# move on to the next boundary to the left
idx_boundaries = idx_boundaries - 1
previous_boundary = boundary
if idx_boundaries < 0:
# If this is the leftmost boundary, anchor its left
# edge at the split
boundary[0] = split
idx_splits = idx_splits - 1
else:
# The split is inside our boundary: split it
new_boundary = [split, boundary[1]]
boundaries.insert(idx_boundaries + 1, new_boundary)
boundary[1] = split
previous_boundary = new_boundary
idx_splits = idx_splits - 1
return boundaries
def _merge_bbox_analysis(self, lattice_bbox, network_bbox):
""" Identify splits that were only detected by lattice or by network
"""
lattice_parse = self.lattice_parser.table_bbox_parses[lattice_bbox]
lattice_cols = lattice_parse["col_anchors"]
network_bbox_data = self.network_parser.table_bbox_parses[network_bbox]
network_cols_boundaries = network_bbox_data["cols_boundaries"]
# Favor network, but complete or adjust its columns based on the
# splits identified by lattice.
if network_cols_boundaries is None:
self.table_bbox_parses[lattice_bbox] = self.lattice_parser
else:
network_cols_boundaries = self._augment_boundaries_with_splits(
network_cols_boundaries,
lattice_cols,
self.lattice_parser.joint_tol)
augmented_bbox = (
network_cols_boundaries[0][0],
min(lattice_bbox[1], network_bbox[1]),
network_cols_boundaries[-1][1],
max(lattice_bbox[3], network_bbox[3]),
)
network_bbox_data["cols_anchors"] = \
boundaries_to_split_lines(network_cols_boundaries)
del self.network_parser.table_bbox_parses[network_bbox]
self.network_parser.table_bbox_parses[augmented_bbox] = \
network_bbox_data
self.table_bbox_parses[augmented_bbox] = self.network_parser
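# A sketch of the merge (hypothetical coordinates): a network parse
# with cols_boundaries [[80, 150], [210, 300]] reconciled with lattice
# col_anchors [100, 200] yields [[80, 100], [100, 200], [200, 300]],
# and the bbox is widened to the union of the two parses.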
def _generate_table_bbox(self):
# Collect bboxes from both parsers
self.lattice_parser._generate_table_bbox()
_lattice_bboxes = sorted(
self.lattice_parser.table_bbox_parses,
key=lambda bbox: (bbox[0], -bbox[1]))
self.network_parser._generate_table_bbox()
_network_bboxes = sorted(
self.network_parser.table_bbox_parses,
key=lambda bbox: (bbox[0], -bbox[1]))
# Merge the data from both processes
for lattice_bbox in _lattice_bboxes:
merged = False
for idx in range(len(_network_bboxes)-1, -1, -1):
network_bbox = _network_bboxes[idx]
if not bboxes_overlap(lattice_bbox, network_bbox):
continue
self._merge_bbox_analysis(lattice_bbox, network_bbox)
del _network_bboxes[idx]
merged = True
if not merged:
self.table_bbox_parses[lattice_bbox] = self.lattice_parser
# Add the bboxes from network that haven't been merged
for network_bbox in _network_bboxes:
self.table_bbox_parses[network_bbox] = self.network_parser
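# Usage sketch (assuming the read_pdf entry point accepts this
# parser's name, "hybrid", as a flavor, as the constructor above
# suggests):
#   camelot.read_pdf("tables.pdf", flavor="hybrid")
# runs the lattice and network analyses and merges overlapping bboxes.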

View File

@ -1,27 +1,16 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import os import os
import sys
import copy
import locale
import logging
import warnings
import subprocess
import numpy as np
import pandas as pd
from .base import BaseParser from .base import BaseParser
from ..core import Table
from ..utils import ( from ..utils import (
build_file_path_in_temp_dir,
export_pdf_as_png,
scale_image, scale_image,
scale_pdf, scale_pdf,
segments_in_bbox, segments_in_bbox,
text_in_bbox, text_in_bbox_per_axis,
merge_close_lines, merge_close_lines,
get_table_index,
compute_accuracy,
compute_whitespace,
) )
from ..image_processing import ( from ..image_processing import (
adaptive_threshold, adaptive_threshold,
@ -31,9 +20,6 @@ from ..image_processing import (
) )
logger = logging.getLogger("camelot")
class Lattice(BaseParser): class Lattice(BaseParser):
"""Lattice method of parsing looks for lines between text """Lattice method of parsing looks for lines between text
to parse the table. to parse the table.
@ -79,7 +65,7 @@ class Lattice(BaseParser):
Size of a pixel neighborhood that is used to calculate a Size of a pixel neighborhood that is used to calculate a
threshold value for the pixel: 3, 5, 7, and so on. threshold value for the pixel: 3, 5, 7, and so on.
For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_. For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_. # noqa
threshold_constant : int, optional (default: -2) threshold_constant : int, optional (default: -2)
Constant subtracted from the mean or weighted mean. Constant subtracted from the mean or weighted mean.
Normally, it is positive but may be zero or negative as well. Normally, it is positive but may be zero or negative as well.
@ -101,7 +87,7 @@ class Lattice(BaseParser):
process_background=False, process_background=False,
line_scale=15, line_scale=15,
copy_text=None, copy_text=None,
shift_text=["l", "t"], shift_text=None,
split_text=False, split_text=False,
flag_size=False, flag_size=False,
strip_text="", strip_text="",
@ -111,23 +97,27 @@ class Lattice(BaseParser):
threshold_constant=-2, threshold_constant=-2,
iterations=0, iterations=0,
resolution=300, resolution=300,
**kwargs **kwargs):
): super().__init__(
self.table_regions = table_regions "lattice",
self.table_areas = table_areas table_regions=table_regions,
table_areas=table_areas,
split_text=split_text,
strip_text=strip_text,
copy_text=copy_text,
shift_text=shift_text or ["l", "t"],
flag_size=flag_size,
)
self.process_background = process_background self.process_background = process_background
self.line_scale = line_scale self.line_scale = line_scale
self.copy_text = copy_text
self.shift_text = shift_text
self.split_text = split_text
self.flag_size = flag_size
self.strip_text = strip_text
self.line_tol = line_tol self.line_tol = line_tol
self.joint_tol = joint_tol self.joint_tol = joint_tol
self.threshold_blocksize = threshold_blocksize self.threshold_blocksize = threshold_blocksize
self.threshold_constant = threshold_constant self.threshold_constant = threshold_constant
self.iterations = iterations self.iterations = iterations
self.resolution = resolution self.resolution = resolution
self.image_path = None
self.pdf_image = None
@staticmethod @staticmethod
def _reduce_index(t, idx, shift_text): def _reduce_index(t, idx, shift_text):
@ -174,51 +164,13 @@ class Lattice(BaseParser):
indices.append((r_idx, c_idx, text)) indices.append((r_idx, c_idx, text))
return indices return indices
@staticmethod def record_parse_metadata(self, table):
def _copy_spanning_text(t, copy_text=None): """Record data about the origin of the table
"""Copies over text in empty spanning cells.
Parameters
----------
t : camelot.core.Table
copy_text : list, optional (default: None)
{'h', 'v'}
Select one or more strings from above and pass them as a list
to specify the direction in which text should be copied over
when a cell spans multiple rows or columns.
Returns
-------
t : camelot.core.Table
""" """
for f in copy_text: super().record_parse_metadata(table)
if f == "h": # for plotting
for i in range(len(t.cells)): table._image = self.pdf_image # Reuse the image used for calc
for j in range(len(t.cells[i])): table._segments = (self.vertical_segments, self.horizontal_segments)
if t.cells[i][j].text.strip() == "":
if t.cells[i][j].hspan and not t.cells[i][j].left:
t.cells[i][j].text = t.cells[i][j - 1].text
elif f == "v":
for i in range(len(t.cells)):
for j in range(len(t.cells[i])):
if t.cells[i][j].text.strip() == "":
if t.cells[i][j].vspan and not t.cells[i][j].top:
t.cells[i][j].text = t.cells[i - 1][j].text
return t
def _generate_image(self):
from ..ext.ghostscript import Ghostscript
self.imagename = "".join([self.rootname, ".png"])
gs_call = "-q -sDEVICE=png16m -o {} -r300 {}".format(
self.imagename, self.filename
)
gs_call = gs_call.encode().split()
null = open(os.devnull, "wb")
with Ghostscript(*gs_call, stdout=null) as gs:
pass
null.close()
def _generate_table_bbox(self): def _generate_table_bbox(self):
def scale_areas(areas): def scale_areas(areas):
@ -233,20 +185,26 @@ class Lattice(BaseParser):
scaled_areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1))) scaled_areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1)))
return scaled_areas return scaled_areas
self.image, self.threshold = adaptive_threshold( self.image_path = build_file_path_in_temp_dir(
self.imagename, os.path.basename(self.filename),
".png"
)
export_pdf_as_png(self.filename, self.image_path, self.resolution)
self.pdf_image, self.threshold = adaptive_threshold(
self.image_path,
process_background=self.process_background, process_background=self.process_background,
blocksize=self.threshold_blocksize, blocksize=self.threshold_blocksize,
c=self.threshold_constant, c=self.threshold_constant,
) )
image_width = self.image.shape[1] image_width = self.pdf_image.shape[1]
image_height = self.image.shape[0] image_height = self.pdf_image.shape[0]
image_width_scaler = image_width / float(self.pdf_width) image_width_scaler = image_width / float(self.pdf_width)
image_height_scaler = image_height / float(self.pdf_height) image_height_scaler = image_height / float(self.pdf_height)
pdf_width_scaler = self.pdf_width / float(image_width) pdf_width_scaler = self.pdf_width / float(image_width)
pdf_height_scaler = self.pdf_height / float(image_height) pdf_height_scaler = self.pdf_height / float(image_height)
image_scalers = (image_width_scaler, image_height_scaler, self.pdf_height) image_scalers = (image_width_scaler,
image_height_scaler, self.pdf_height)
pdf_scalers = (pdf_width_scaler, pdf_height_scaler, image_height) pdf_scalers = (pdf_width_scaler, pdf_height_scaler, image_height)
if self.table_areas is None: if self.table_areas is None:
@ -288,46 +246,88 @@ class Lattice(BaseParser):
areas = scale_areas(self.table_areas) areas = scale_areas(self.table_areas)
table_bbox = find_joints(areas, vertical_mask, horizontal_mask) table_bbox = find_joints(areas, vertical_mask, horizontal_mask)
self.table_bbox_unscaled = copy.deepcopy(table_bbox) [
self.table_bbox_parses,
self.table_bbox, self.vertical_segments, self.horizontal_segments = scale_image( self.vertical_segments,
self.horizontal_segments
] = scale_image(
table_bbox, vertical_segments, horizontal_segments, pdf_scalers table_bbox, vertical_segments, horizontal_segments, pdf_scalers
) )
def _generate_columns_and_rows(self, table_idx, tk): for bbox, parse in self.table_bbox_parses.items():
# select elements which lie within table_bbox joints = parse["joints"]
t_bbox = {}
v_s, h_s = segments_in_bbox( # Merge x coordinates that are close together
tk, self.vertical_segments, self.horizontal_segments line_tol = self.line_tol
# Sort the joints, make them a list of lists (instead of sets)
joints_normalized = list(
map(
lambda x: list(x),
sorted(joints, key=lambda j: - j[0])
) )
t_bbox["horizontal"] = text_in_bbox(tk, self.horizontal_text) )
t_bbox["vertical"] = text_in_bbox(tk, self.vertical_text) for idx in range(1, len(joints_normalized)):
x_left, x_right = \
joints_normalized[idx-1][0], joints_normalized[idx][0]
if x_left - line_tol <= x_right <= x_left + line_tol:
joints_normalized[idx][0] = x_left
t_bbox["horizontal"].sort(key=lambda x: (-x.y0, x.x0)) # Merge y coordinates that are close together
t_bbox["vertical"].sort(key=lambda x: (x.x0, -x.y0)) joints_normalized = sorted(joints_normalized, key=lambda j: -j[1])
for idx in range(1, len(joints_normalized)):
y_bottom, y_top = \
joints_normalized[idx-1][1], joints_normalized[idx][1]
if y_bottom - line_tol <= y_top <= y_bottom + line_tol:
joints_normalized[idx][1] = y_bottom
self.t_bbox = t_bbox # FRHTODO: check this is useful, otherwise get rid of the code
# above
parse["joints_normalized"] = joints_normalized
cols = list(map(lambda coords: coords[0], joints))
cols.extend([bbox[0], bbox[2]])
rows = list(map(lambda coords: coords[1], joints))
rows.extend([bbox[1], bbox[3]])
cols, rows = zip(*self.table_bbox[tk])
cols, rows = list(cols), list(rows)
cols.extend([tk[0], tk[2]])
rows.extend([tk[1], tk[3]])
# sort horizontal and vertical segments # sort horizontal and vertical segments
cols = merge_close_lines(sorted(cols), line_tol=self.line_tol) cols = merge_close_lines(sorted(cols), line_tol=self.line_tol)
rows = merge_close_lines(sorted(rows, reverse=True), line_tol=self.line_tol) rows = merge_close_lines(
# make grid using x and y coord of shortlisted rows and cols sorted(rows, reverse=True),
cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)] line_tol=self.line_tol
rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)] )
parse["col_anchors"] = cols
parse["row_anchors"] = rows
def _generate_columns_and_rows(self, bbox, user_cols):
# select elements which lie within table_bbox
v_s, h_s = segments_in_bbox(
bbox, self.vertical_segments, self.horizontal_segments
)
self.t_bbox = text_in_bbox_per_axis(
bbox,
self.horizontal_text,
self.vertical_text
)
parse = self.table_bbox_parses[bbox]
# make grid using x and y coord of shortlisted rows and cols
cols = [
(parse["col_anchors"][i], parse["col_anchors"][i + 1])
for i in range(0, len(parse["col_anchors"]) - 1)
]
rows = [
(parse["row_anchors"][i], parse["row_anchors"][i + 1])
for i in range(0, len(parse["row_anchors"]) - 1)
]
return cols, rows, v_s, h_s return cols, rows, v_s, h_s
def _generate_table(self, table_idx, cols, rows, **kwargs): def _generate_table(self, table_idx, bbox, cols, rows, **kwargs):
v_s = kwargs.get("v_s") v_s = kwargs.get("v_s")
h_s = kwargs.get("h_s") h_s = kwargs.get("h_s")
if v_s is None or h_s is None: if v_s is None or h_s is None:
raise ValueError("No segments found on {}".format(self.rootname)) raise ValueError("No segments found on {}".format(self.rootname))
table = Table(cols, rows) table = self._initialize_new_table(table_idx, bbox, cols, rows)
# set table edges to True using ver+hor lines # set table edges to True using ver+hor lines
table = table.set_edges(v_s, h_s, joint_tol=self.joint_tol) table = table.set_edges(v_s, h_s, joint_tol=self.joint_tol)
# set table border edges to True # set table border edges to True
@ -335,81 +335,5 @@ class Lattice(BaseParser):
# set spanning cells to True # set spanning cells to True
table = table.set_span() table = table.set_span()
pos_errors = [] self.record_parse_metadata(table)
# TODO: have a single list in place of two directional ones?
# sorted on x-coordinate based on reading order i.e. LTR or RTL
for direction in ["vertical", "horizontal"]:
for t in self.t_bbox[direction]:
indices, error = get_table_index(
table,
t,
direction,
split_text=self.split_text,
flag_size=self.flag_size,
strip_text=self.strip_text,
)
if indices[:2] != (-1, -1):
pos_errors.append(error)
indices = Lattice._reduce_index(
table, indices, shift_text=self.shift_text
)
for r_idx, c_idx, text in indices:
table.cells[r_idx][c_idx].text = text
accuracy = compute_accuracy([[100, pos_errors]])
if self.copy_text is not None:
table = Lattice._copy_spanning_text(table, copy_text=self.copy_text)
data = table.data
table.df = pd.DataFrame(data)
table.shape = table.df.shape
whitespace = compute_whitespace(data)
table.flavor = "lattice"
table.accuracy = accuracy
table.whitespace = whitespace
table.order = table_idx + 1
table.page = int(os.path.basename(self.rootname).replace("page-", ""))
# for plotting
_text = []
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
table._text = _text
table._image = (self.image, self.table_bbox_unscaled)
table._segments = (self.vertical_segments, self.horizontal_segments)
table._textedges = None
return table return table
def extract_tables(self, filename, suppress_stdout=False, layout_kwargs={}):
self._generate_layout(filename, layout_kwargs)
if not suppress_stdout:
logger.info("Processing {}".format(os.path.basename(self.rootname)))
if not self.horizontal_text:
if self.images:
warnings.warn(
"{} is image-based, camelot only works on"
" text-based pages.".format(os.path.basename(self.rootname))
)
else:
warnings.warn(
"No tables found on {}".format(os.path.basename(self.rootname))
)
return []
self._generate_image()
self._generate_table_bbox()
_tables = []
# sort tables based on y-coord
for table_idx, tk in enumerate(
sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True)
):
cols, rows, v_s, h_s = self._generate_columns_and_rows(table_idx, tk)
table = self._generate_table(table_idx, cols, rows, v_s=v_s, h_s=h_s)
table._bbox = tk
_tables.append(table)
return _tables

View File

@ -0,0 +1,726 @@
# -*- coding: utf-8 -*-
"""Implementation of network table parser."""
from __future__ import division
import copy
import math
import numpy as np
from .base import TextBaseParser
from ..core import (
TextAlignments,
ALL_ALIGNMENTS,
HORIZONTAL_ALIGNMENTS,
VERTICAL_ALIGNMENTS
)
from ..utils import (
bbox_from_str,
text_in_bbox,
textlines_overlapping_bbox,
bbox_from_textlines,
find_columns_boundaries,
boundaries_to_split_lines,
text_in_bbox_per_axis,
)
# maximum number of columns over which a header can spread
MAX_COL_SPREAD_IN_HEADER = 3
# Minimum number of textlines in a table
MINIMUM_TEXTLINES_IN_TABLE = 6
def column_spread(left, right, col_anchors):
"""Get the number of columns crossed by a segment [left, right]."""
index_left = 0
while index_left < len(col_anchors) \
and col_anchors[index_left] < left:
index_left += 1
index_right = index_left
while index_right < len(col_anchors) \
and col_anchors[index_right] < right:
index_right += 1
return index_right - index_left
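# Example: with col_anchors [0, 100, 200, 300], a segment spanning
# [50, 250] crosses the anchors at 100 and 200, so
# column_spread(50, 250, [0, 100, 200, 300]) == 2.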
def find_closest_tls(bbox, tls):
""" Search for tls that are the closest but outside in all 4 directions
"""
left, right, top, bottom = None, None, None, None
(bbox_left, bbox_bottom, bbox_right, bbox_top) = bbox
for textline in tls:
if textline.x1 < bbox_left:
# Left: check it overlaps horizontally
if textline.y0 > bbox_top or textline.y1 < bbox_bottom:
continue
if left is None or left.x1 < textline.x1:
left = textline
elif bbox_right < textline.x0:
# Right: check it overlaps horizontally
if textline.y0 > bbox_top or textline.y1 < bbox_bottom:
continue
if right is None or right.x0 > textline.x0:
right = textline
else:
# Either bottom or top: must overlap vertically
if textline.x0 > bbox_right or textline.x1 < bbox_left:
continue
if textline.y1 < bbox_bottom:
# Bottom
if bottom is None or bottom.y1 < textline.y1:
bottom = textline
elif bbox_top < textline.y0:
# Top
if top is None or top.y0 > textline.y0:
top = textline
return {
"left": left,
"right": right,
"top": top,
"bottom": bottom,
}
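# Example: for bbox (100, 100, 200, 200), a textline ending at
# x1 == 90 that overlaps the bbox vertically is a "left" candidate;
# among candidates, the one with the greatest x1 (nearest edge) wins.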
def search_header_from_body_bbox(body_bbox, textlines, col_anchors, max_v_gap):
"""Expand a bbox vertically up by looking for plausible headers.
The core algorithm is based on fairly strict alignment of text. It works
for the table body, but might fail on tables' headers since they tend to be
in a different font, alignment (e.g. vertical), etc.
This method evaluates the area above the table body's bbox for
characteristics of a table header: close to the top of the body, with cells
that fit within the horizontal bounds identified.
"""
new_bbox = body_bbox
(left, bottom, right, top) = body_bbox
zones = []
keep_searching = True
while keep_searching:
keep_searching = False
# a/ first look for the closest text element above the bbox.
# It will be the anchor for a possible new row.
closest_above = None
all_above = []
for textline in textlines:
# higher than the table, >50% within its bounds
textline_center = 0.5 * (textline.x0 + textline.x1)
if textline.y0 > top and left < textline_center < right:
all_above.append(textline)
if closest_above is None or closest_above.y0 > textline.y0:
closest_above = textline
if closest_above and closest_above.y0 < top + max_v_gap:
# b/ We have a candidate cell that is within the correct
# vertical band, and directly above the table. Starting from
# this anchor, we list all the textlines within the same row.
tls_in_new_row = []
top = closest_above.y1
pushed_up = True
while pushed_up:
pushed_up = False
# Iterate and extract elements that fit in the row
# from our list
for i in range(len(all_above) - 1, -1, -1):
textline = all_above[i]
if textline.y0 < top:
# The bottom of this element is within our row
# so we add it.
tls_in_new_row.append(textline)
all_above.pop(i)
if textline.y1 > top:
# If the top of this element raises our row's
# band, we'll need to keep on searching for
# overlapping items
top = textline.y1
pushed_up = True
# Get the x-ranges for all the textlines, and merge the
# x-ranges that overlap
zones = zones + list(
map(
lambda textline: [textline.x0, textline.x1],
tls_in_new_row
)
)
zones.sort(key=lambda z: z[0]) # Sort by left coordinate
# Starting from the right, if two zones overlap horizontally,
# merge them
merged_something = True
while merged_something:
merged_something = False
for i in range(len(zones) - 1, 0, -1):
zone_right = zones[i]
zone_left = zones[i-1]
if zone_left[1] >= zone_right[0]:
zone_left[1] = max(zone_right[1], zone_left[1])
zones.pop(i)
merged_something = True
max_spread = max(
list(
map(
lambda zone: column_spread(
zone[0], zone[1], col_anchors),
zones
)
)
)
# Accept textlines that cross columns boundaries, as long as they
# cross less than MAX_COL_SPREAD_IN_HEADER, and half the number of
# columns.
# This is to avoid picking unrelated paragraphs.
if max_spread <= min(
MAX_COL_SPREAD_IN_HEADER,
math.ceil(len(col_anchors) / 2)):
# Combined, the elements we've identified don't cross more
# than the authorized number of columns.
# We're trying to avoid
# 0: <BAD: Added header spans too broad>
# 1: <A1> <B1> <C1> <D1> <E1>
# 2: <A2> <B2> <C2> <D2> <E2>
new_bbox = (left, bottom, right, top)
# At this stage we've identified a plausible row (or the
# beginning of one).
keep_searching = True
return new_bbox
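# Usage sketch (hypothetical values): for a body bbox of
# (50, 400, 550, 700) and col_anchors [50, 150, 300, 550], a header
# line sitting within max_v_gap above y == 700 and spanning at most
# min(MAX_COL_SPREAD_IN_HEADER, half the columns) can raise the
# returned top, e.g. to (50, 400, 550, 725).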
class AlignmentCounter():
"""
For a given textline, represent all other textlines aligned with it.
A textline is vertically aligned with others if their bboxes match on
the left, right, or middle x-coord, and horizontally aligned if they
match on the top, bottom, or center y-coord.
"""
def __init__(self):
self.alignment_to_occurrences = {}
for alignment in ALL_ALIGNMENTS:
self.alignment_to_occurrences[alignment] = []
def __getitem__(self, key):
return self.alignment_to_occurrences[key]
def __setitem__(self, key, value):
self.alignment_to_occurrences[key] = value
return value
def max_alignments(self, alignment_ids=None):
"""Get the alignment dimension with the max number of textlines.
"""
alignment_ids = alignment_ids or self.alignment_to_occurrences.keys()
alignment_items = map(
lambda alignment_id: (
alignment_id,
self.alignment_to_occurrences[alignment_id]
),
alignment_ids
)
return max(alignment_items, key=lambda item: len(item[1]))
def max_v(self):
"""Tuple (alignment_id, textlines) of largest vertical row.
"""
# Note that the horizontal alignments (left, center, right) are aligned
# vertically in a column, so max_v is calculated by looking at
# horizontal alignments.
return self.max_alignments(HORIZONTAL_ALIGNMENTS)
def max_h(self):
"""Tuple (alignment_id, textlines) of largest horizontal col.
"""
return self.max_alignments(VERTICAL_ALIGNMENTS)
def max_v_count(self):
"""Returns the maximum number of alignments along
one of the vertical axes (left/right/middle).
"""
return len(self.max_v()[1])
def max_h_count(self):
"""Returns the maximum number of alignments along
one of the horizontal axes (bottom/top/center).
"""
return len(self.max_h()[1])
def alignment_score(self):
"""We define the alignment score of a textline as the product of the
number of aligned elements - 1. The -1 is to avoid favoring
singletons on a long line.
"""
return (self.max_v_count()-1) * (self.max_h_count()-1)
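# Example: a textline in a column of 5 textlines and a row of 3
# scores (5 - 1) * (3 - 1) == 8; a fully isolated textline scores 0.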
class TextNetworks(TextAlignments):
"""Text elements connected by vertical AND horizontal alignments.
The alignment dict has six keys based on the hor/vert alignments,
and each key's value is a list of camelot.core.TextAlignment objects.
"""
def __init__(self):
super().__init__(ALL_ALIGNMENTS)
# For each textline, dictionary "alignment type" to
# "number of textlines aligned"
self._textline_to_alignments = {}
def _update_alignment(self, alignment, coord, textline):
alignment.register_aligned_textline(textline, coord)
def _register_all_text_lines(self, textlines):
"""Add all textlines to our network repository to
identify alignments.
"""
# Identify all the alignments
for textline in textlines:
if len(textline.get_text().strip()) > 0:
self._register_textline(textline)
def _compute_alignment_counts(self):
"""Build a dictionary textline -> alignment object.
"""
for align_id, textedges in self._text_alignments.items():
for textedge in textedges:
for textline in textedge.textlines:
alignments = self._textline_to_alignments.get(
textline, None)
if alignments is None:
alignments = AlignmentCounter()
self._textline_to_alignments[textline] = alignments
alignments[align_id] = textedge.textlines
def remove_unconnected_edges(self):
"""Weed out elements which are only connected to others vertically
or horizontally. Connections need to exist across both
dimensions.
"""
removed_singletons = True
while removed_singletons:
removed_singletons = False
for text_alignments in self._text_alignments.values():
# For each alignment edge, remove items if they are singletons
# either horizontally or vertically
for text_alignment in text_alignments:
for i in range(len(text_alignment.textlines) - 1, -1, -1):
textline = text_alignment.textlines[i]
alignments = self._textline_to_alignments[textline]
if alignments.max_h_count() <= 1 or \
alignments.max_v_count() <= 1:
del text_alignment.textlines[i]
removed_singletons = True
self._textline_to_alignments = {}
self._compute_alignment_counts()
def most_connected_textline(self):
""" Retrieve the textline that is most connected across vertical and
horizontal axis.
"""
# Find the textline with the highest alignment score, with a tie break
# to prefer textlines further down in the table. Starting the search
# from the table's bottom allows the algorithm to collect data on more
# cells before reaching the header, which is typically harder to parse.
return max(
self._textline_to_alignments.keys(),
key=lambda textline:
(
self._textline_to_alignments[textline].alignment_score(),
-textline.y0, -textline.x0
),
default=None
)
def compute_plausible_gaps(self):
""" Evaluate plausible gaps between cells horizontally and vertically
based on the textlines aligned with the most connected textline.
Returns
-------
gaps_hv : tuple
(horizontal_gap, vertical_gap) in pdf coordinate space.
"""
# Determine the textline that has the most combined
# alignments across the horizontal and vertical axes.
# It will serve as a reference axis along which to collect the average
# spacing between rows/cols.
most_aligned_tl = self.most_connected_textline()
if most_aligned_tl is None:
return None
# Retrieve the list of textlines it's aligned with, across both
# axes
best_alignment = self._textline_to_alignments[most_aligned_tl]
__, ref_h_textlines = best_alignment.max_h()
__, ref_v_textlines = best_alignment.max_v()
if len(ref_v_textlines) <= 1 or len(ref_h_textlines) <= 1:
return None
h_textlines = sorted(
ref_h_textlines,
key=lambda textline: textline.x0,
reverse=True
)
v_textlines = sorted(
ref_v_textlines,
key=lambda textline: textline.y0,
reverse=True
)
h_gaps, v_gaps = [], []
for i in range(1, len(v_textlines)):
v_gaps.append(v_textlines[i-1].y0 - v_textlines[i].y0)
for i in range(1, len(h_textlines)):
h_gaps.append(h_textlines[i-1].x0 - h_textlines[i].x0)
if (not h_gaps or not v_gaps):
return None
percentile = 75
gaps_hv = (
2.0 * np.percentile(h_gaps, percentile),
2.0 * np.percentile(v_gaps, percentile)
)
return gaps_hv
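# Example: if the reference column's y0 gaps are [12, 12, 14, 30]
# (the 30 being a section break), the 75th percentile is 18.0, so the
# vertical threshold becomes 2.0 * 18.0 == 36.0 instead of being
# dominated by the outlier.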
def search_table_body(self, gaps_hv, parse_details=None):
""" Build a candidate bbox for the body of a table using network algo
Seed the process with the textline with the highest alignment
score, then expand the bbox with textlines within threshold.
Parameters
----------
gaps_hv : tuple
The maximum distance allowed to consider surrounding lines/columns
as part of the same table.
parse_details : list, optional
Optional list in which to store extra information
to help later visualization of the table creation.
"""
# First, determine the textline that has the most combined
# alignments across the horizontal and vertical axes.
# It will serve both as a starting point for the table boundary
# search, and as a way to estimate the average spacing between
# rows/cols.
most_aligned_tl = self.most_connected_textline()
# Calculate the 75th percentile of the horizontal/vertical
# gaps between textlines. Use this as a reference for a threshold
# to not exceed while looking for table boundaries.
max_h_gap, max_v_gap = gaps_hv[0], gaps_hv[1]
if parse_details is not None:
# Store debug info
parse_details_search = {
"max_h_gap": max_h_gap,
"max_v_gap": max_v_gap,
"iterations": []
}
parse_details.append(parse_details_search)
else:
parse_details_search = None
bbox = [most_aligned_tl.x0, most_aligned_tl.y0,
most_aligned_tl.x1, most_aligned_tl.y1]
# For the body of the table, we only consider cells that have
# alignments on both axes.
tls_search_space = list(self._textline_to_alignments.keys())
tls_search_space.remove(most_aligned_tl)
tls_in_bbox = [most_aligned_tl]
last_bbox = None
last_cols_bounds = [(most_aligned_tl.x0, most_aligned_tl.x1)]
while last_bbox != bbox:
if parse_details_search is not None:
# Store debug info
parse_details_search["iterations"].append(bbox)
# Check that the closest tls are within the gaps allowed
last_bbox = bbox
cand_bbox = last_bbox.copy()
closest_tls = find_closest_tls(bbox, tls_search_space)
for direction, textline in closest_tls.items():
if textline is None:
continue
expanded_cand_bbox = cand_bbox.copy()
if direction == "left":
if expanded_cand_bbox[0] - textline.x1 > gaps_hv[0]:
continue
expanded_cand_bbox[0] = textline.x0
elif direction == "right":
if textline.x0 - expanded_cand_bbox[2] > gaps_hv[0]:
continue
expanded_cand_bbox[2] = textline.x1
elif direction == "bottom":
if expanded_cand_bbox[1] - textline.y1 > gaps_hv[1]:
continue
expanded_cand_bbox[1] = textline.y0
elif direction == "top":
if textline.y0 - expanded_cand_bbox[3] > gaps_hv[1]:
continue
expanded_cand_bbox[3] = textline.y1
# If they are, see what an expanded bbox in that direction
# would contain
new_tls = text_in_bbox(expanded_cand_bbox, tls_search_space)
tls_in_new_box = new_tls + tls_in_bbox
# And if we're expanding up or down, check that the addition
# of the new row won't reduce the number of columns.
# This happens when text covers multiple rows - that's only
# allowed in the header, treated separately.
cols_bounds = find_columns_boundaries(tls_in_new_box)
if direction in ["bottom", "top"] and \
len(cols_bounds) < len(last_cols_bounds):
continue
# We have an expansion candidate: register it, update the
# search space and repeat
# We use bbox_from_textlines instead of cand_bbox in case some
# overlapping textlines require a large bbox for strict fit.
bbox = cand_bbox = list(bbox_from_textlines(tls_in_new_box))
last_cols_bounds = cols_bounds
tls_in_bbox.extend(new_tls)
for i in range(len(tls_search_space) - 1, -1, -1):
textline = tls_search_space[i]
if textline in new_tls:
del tls_search_space[i]
if len(tls_in_bbox) >= MINIMUM_TEXTLINES_IN_TABLE:
return bbox
return None
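# Usage sketch: seeded at the most connected textline, the bbox grows
# one neighbor at a time while each gap stays within gaps_hv; e.g. on
# a tight 3x4 grid of cells, search_table_body((30.0, 14.0)) (values
# hypothetical) returns their enclosing bbox, while a search that
# gathers fewer than MINIMUM_TEXTLINES_IN_TABLE textlines yields None.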
def generate(self, textlines):
"""Generate the text edge dictionaries based on the
input textlines.
"""
self._register_all_text_lines(textlines)
self._compute_alignment_counts()
class Network(TextBaseParser):
"""Network method of parsing looks for spaces between text
to parse the table.
If you want to specify columns when specifying multiple table
areas, make sure that the lengths of both lists are equal.
Parameters
----------
table_regions : list, optional (default: None)
List of page regions that may contain tables of the form x1,y1,x2,y2
where (x1, y1) -> left-top and (x2, y2) -> right-bottom
in PDF coordinate space.
table_areas : list, optional (default: None)
List of table area strings of the form x1,y1,x2,y2
where (x1, y1) -> left-top and (x2, y2) -> right-bottom
in PDF coordinate space.
columns : list, optional (default: None)
List of column x-coordinate strings where the coordinates
are comma-separated.
split_text : bool, optional (default: False)
Split text that spans across multiple cells.
flag_size : bool, optional (default: False)
Flag text based on font size. Useful to detect
super/subscripts. Adds <s></s> around flagged text.
strip_text : str, optional (default: '')
Characters that should be stripped from a string before
assigning it to a cell.
edge_tol : int, optional (default: None)
Tolerance parameter for extending textedges vertically;
when None, a plausible value is derived from the detected text gaps.
row_tol : int, optional (default: 2)
Tolerance parameter used to combine text vertically,
to generate rows.
column_tol : int, optional (default: 0)
Tolerance parameter used to combine text horizontally,
to generate columns.
"""
def __init__(
self,
table_regions=None,
table_areas=None,
columns=None,
flag_size=False,
split_text=False,
strip_text="",
edge_tol=None,
row_tol=2,
column_tol=0,
debug=False,
**kwargs):
super().__init__(
"network",
table_regions=table_regions,
table_areas=table_areas,
columns=columns,
flag_size=flag_size,
split_text=split_text,
strip_text=strip_text,
edge_tol=edge_tol,
row_tol=row_tol,
column_tol=column_tol,
debug=debug,
)
def _generate_table_bbox(self):
user_provided_bboxes = None
if self.table_areas is not None:
# User gave us table areas already. We will use their coordinates
# to find column anchors.
user_provided_bboxes = []
for area_str in self.table_areas:
user_provided_bboxes.append(bbox_from_str(area_str))
# Take all the textlines that are not just spaces
all_textlines = [
t for t in self.horizontal_text + self.vertical_text
if len(t.get_text().strip()) > 0
]
textlines = self._apply_regions_filter(all_textlines)
textlines_processed = {}
self.table_bbox_parses = {}
if self.parse_details is not None:
parse_details_network_searches = []
self.parse_details["network_searches"] = \
parse_details_network_searches
parse_details_bbox_searches = []
self.parse_details["bbox_searches"] = parse_details_bbox_searches
self.parse_details["col_searches"] = []
else:
parse_details_network_searches = None
parse_details_bbox_searches = None
while True:
# Find a bbox: either pulling from the user's or from the network
# algorithm.
# First look for the body of the table
bbox_body = None
if user_provided_bboxes is not None:
if len(user_provided_bboxes) > 0:
bbox_body = user_provided_bboxes.pop()
else:
text_network = TextNetworks()
text_network.generate(textlines)
text_network.remove_unconnected_edges()
gaps_hv = text_network.compute_plausible_gaps()
if gaps_hv is None:
return None
# edge_tol instructions override the calculated vertical gap
edge_tol_hv = (
gaps_hv[0],
gaps_hv[1] if self.edge_tol is None else self.edge_tol
)
bbox_body = text_network.search_table_body(
edge_tol_hv,
parse_details=parse_details_bbox_searches
)
if parse_details_network_searches is not None:
# Preserve the current edge calculation for debugging
parse_details_network_searches.append(
copy.deepcopy(text_network)
)
if bbox_body is None:
break
# Get all the textlines that overlap with the box, compute
# columns
tls_in_bbox = textlines_overlapping_bbox(bbox_body, textlines)
cols_boundaries = find_columns_boundaries(tls_in_bbox)
cols_anchors = boundaries_to_split_lines(cols_boundaries)
# Unless the user gave us strict bbox_body, try to find a header
# above the body to build the full bbox.
if user_provided_bboxes is not None:
bbox_full = bbox_body
else:
# Expand the text box to fully contain the tls we found
bbox_body = bbox_from_textlines(tls_in_bbox)
# Apply a heuristic to salvage headers whose formatting might
# be off compared to the rest of the table.
bbox_full = search_header_from_body_bbox(
bbox_body,
textlines,
cols_anchors,
gaps_hv[1]
)
table_parse = {
"bbox_body": bbox_body,
"cols_boundaries": cols_boundaries,
"cols_anchors": cols_anchors,
"bbox_full": bbox_full
}
self.table_bbox_parses[bbox_full] = table_parse
if self.parse_details is not None:
self.parse_details["col_searches"].append(table_parse)
# Remember what textlines we processed, and repeat
for textline in tls_in_bbox:
textlines_processed[textline] = None
textlines = list(filter(
lambda textline: textline not in textlines_processed,
textlines
))
def _generate_columns_and_rows(self, bbox, user_cols):
# select elements which lie within table_bbox
self.t_bbox = text_in_bbox_per_axis(
bbox,
self.horizontal_text,
self.vertical_text
)
all_tls = list(
sorted(
filter(
lambda textline: len(textline.get_text().strip()) > 0,
self.t_bbox["horizontal"] + self.t_bbox["vertical"]
),
key=lambda textline: (-textline.y0, textline.x0)
)
)
text_x_min, text_y_min, text_x_max, text_y_max = bbox_from_textlines(
all_tls
)
# FRHTODO:
# This algorithm takes the horizontal textlines in the bbox, and groups
# them into rows based on their bottom y0.
# That's wrong: it misses the vertical items, and misses out on all
# the alignment identification work we've done earlier.
rows_grouped = self._group_rows(all_tls, row_tol=self.row_tol)
rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
if user_cols is not None:
cols = [text_x_min] + user_cols + [text_x_max]
cols = [
(cols[i], cols[i + 1])
for i in range(0, len(cols) - 1)
]
else:
parse_details = self.table_bbox_parses[bbox]
col_anchors = parse_details["cols_anchors"]
cols = list(map(
lambda idx: [col_anchors[idx], col_anchors[idx + 1]],
range(0, len(col_anchors) - 1)
))
return cols, rows, None, None
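# Example: with user_cols [100, 200] and text spanning x in [50, 300],
# the returned column intervals are [(50, 100), (100, 200), (200, 300)].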

View File

@ -1,21 +1,18 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import os
import logging
import warnings import warnings
import numpy as np from .base import TextBaseParser
import pandas as pd from ..core import TextEdges
from ..utils import (
from .base import BaseParser bbox_from_str,
from ..core import TextEdges, Table bbox_from_textlines,
from ..utils import text_in_bbox, get_table_index, compute_accuracy, compute_whitespace text_in_bbox,
text_in_bbox_per_axis,
)
logger = logging.getLogger("camelot") class Stream(TextBaseParser):
class Stream(BaseParser):
"""Stream method of parsing looks for spaces between text """Stream method of parsing looks for spaces between text
to parse the table. to parse the table.
@ -59,214 +56,31 @@ class Stream(BaseParser):
table_regions=None, table_regions=None,
table_areas=None, table_areas=None,
columns=None, columns=None,
split_text=False,
flag_size=False, flag_size=False,
split_text=False,
strip_text="", strip_text="",
edge_tol=50, edge_tol=50,
row_tol=2, row_tol=2,
column_tol=0, column_tol=0,
**kwargs **kwargs):
): super().__init__(
self.table_regions = table_regions "stream",
self.table_areas = table_areas table_regions=table_regions,
self.columns = columns table_areas=table_areas,
self._validate_columns() columns=columns,
self.split_text = split_text flag_size=flag_size,
self.flag_size = flag_size split_text=split_text,
self.strip_text = strip_text strip_text=strip_text,
self.edge_tol = edge_tol edge_tol=edge_tol,
self.row_tol = row_tol row_tol=row_tol,
self.column_tol = column_tol column_tol=column_tol,
)
@staticmethod self.textedges = []
def _text_bbox(t_bbox):
"""Returns bounding box for the text present on a page.
Parameters
----------
t_bbox : dict
Dict with two keys 'horizontal' and 'vertical' with lists of
LTTextLineHorizontals and LTTextLineVerticals respectively.
Returns
-------
text_bbox : tuple
Tuple (x0, y0, x1, y1) in pdf coordinate space.
"""
xmin = min([t.x0 for direction in t_bbox for t in t_bbox[direction]])
ymin = min([t.y0 for direction in t_bbox for t in t_bbox[direction]])
xmax = max([t.x1 for direction in t_bbox for t in t_bbox[direction]])
ymax = max([t.y1 for direction in t_bbox for t in t_bbox[direction]])
text_bbox = (xmin, ymin, xmax, ymax)
return text_bbox
@staticmethod
def _group_rows(text, row_tol=2):
"""Groups PDFMiner text objects into rows vertically
within a tolerance.
Parameters
----------
text : list
List of PDFMiner text objects.
row_tol : int, optional (default: 2)
Returns
-------
rows : list
Two-dimensional list of text objects grouped into rows.
"""
row_y = 0
rows = []
temp = []
for t in text:
# is checking for upright necessary?
# if t.get_text().strip() and all([obj.upright for obj in t._objs if
# type(obj) is LTChar]):
if t.get_text().strip():
if not np.isclose(row_y, t.y0, atol=row_tol):
rows.append(sorted(temp, key=lambda t: t.x0))
temp = []
row_y = t.y0
temp.append(t)
rows.append(sorted(temp, key=lambda t: t.x0))
__ = rows.pop(0) # TODO: hacky
return rows
@staticmethod
def _merge_columns(l, column_tol=0):
"""Merges column boundaries horizontally if they overlap
or lie within a tolerance.
Parameters
----------
l : list
List of column x-coordinate tuples.
column_tol : int, optional (default: 0)
Returns
-------
merged : list
List of merged column x-coordinate tuples.
"""
merged = []
for higher in l:
if not merged:
merged.append(higher)
else:
lower = merged[-1]
if column_tol >= 0:
if higher[0] <= lower[1] or np.isclose(
higher[0], lower[1], atol=column_tol
):
upper_bound = max(lower[1], higher[1])
lower_bound = min(lower[0], higher[0])
merged[-1] = (lower_bound, upper_bound)
else:
merged.append(higher)
elif column_tol < 0:
if higher[0] <= lower[1]:
if np.isclose(higher[0], lower[1], atol=abs(column_tol)):
merged.append(higher)
else:
upper_bound = max(lower[1], higher[1])
lower_bound = min(lower[0], higher[0])
merged[-1] = (lower_bound, upper_bound)
else:
merged.append(higher)
return merged
@staticmethod
def _join_rows(rows_grouped, text_y_max, text_y_min):
"""Makes row coordinates continuous.
Parameters
----------
rows_grouped : list
Two-dimensional list of text objects grouped into rows.
text_y_max : int
text_y_min : int
Returns
-------
rows : list
List of continuous row y-coordinate tuples.
"""
row_mids = [
sum([(t.y0 + t.y1) / 2 for t in r]) / len(r) if len(r) > 0 else 0
for r in rows_grouped
]
rows = [(row_mids[i] + row_mids[i - 1]) / 2 for i in range(1, len(row_mids))]
rows.insert(0, text_y_max)
rows.append(text_y_min)
rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)]
return rows
@staticmethod
def _add_columns(cols, text, row_tol):
"""Adds columns to existing list by taking into account
the text that lies outside the current column x-coordinates.
Parameters
----------
cols : list
List of column x-coordinate tuples.
text : list
List of PDFMiner text objects.
ytol : int
Returns
-------
cols : list
Updated list of column x-coordinate tuples.
"""
if text:
text = Stream._group_rows(text, row_tol=row_tol)
elements = [len(r) for r in text]
new_cols = [
(t.x0, t.x1) for r in text if len(r) == max(elements) for t in r
]
cols.extend(Stream._merge_columns(sorted(new_cols)))
return cols
@staticmethod
def _join_columns(cols, text_x_min, text_x_max):
"""Makes column coordinates continuous.
Parameters
----------
cols : list
List of column x-coordinate tuples.
text_x_min : int
text_y_max : int
Returns
-------
cols : list
Updated list of column x-coordinate tuples.
"""
cols = sorted(cols)
cols = [(cols[i][0] + cols[i - 1][1]) / 2 for i in range(1, len(cols))]
cols.insert(0, text_x_min)
cols.append(text_x_max)
cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
return cols
def _validate_columns(self):
if self.table_areas is not None and self.columns is not None:
if len(self.table_areas) != len(self.columns):
raise ValueError("Length of table_areas and columns" " should be equal")
def _nurminen_table_detection(self, textlines): def _nurminen_table_detection(self, textlines):
"""A general implementation of the table detection algorithm """A general implementation of the table detection algorithm
described by Anssi Nurminen's master's thesis. described by Anssi Nurminen's master's thesis.
Link: https://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3 Link: https://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3 # noqa
Assumes that tables are situated relatively far apart Assumes that tables are situated relatively far apart
vertically. vertically.
@ -283,65 +97,59 @@ class Stream(BaseParser):
# guess table areas using textlines and relevant edges # guess table areas using textlines and relevant edges
table_bbox = textedges.get_table_areas(textlines, relevant_textedges) table_bbox = textedges.get_table_areas(textlines, relevant_textedges)
# treat whole page as table area if no table areas found # treat whole page as table area if no table areas found
if not len(table_bbox): if not table_bbox:
table_bbox = {(0, 0, self.pdf_width, self.pdf_height): None} table_bbox = {(0, 0, self.pdf_width, self.pdf_height): None}
return table_bbox return table_bbox
def record_parse_metadata(self, table):
"""Record data about the origin of the table
"""
super().record_parse_metadata(table)
table._textedges = self.textedges
def _generate_table_bbox(self): def _generate_table_bbox(self):
self.textedges = []
if self.table_areas is None: if self.table_areas is None:
hor_text = self.horizontal_text hor_text = self.horizontal_text
if self.table_regions is not None: if self.table_regions is not None:
# filter horizontal text # filter horizontal text
hor_text = [] hor_text = []
for region in self.table_regions: for region_str in self.table_regions:
x1, y1, x2, y2 = region.split(",") region_text = text_in_bbox(
x1 = float(x1) bbox_from_str(region_str),
y1 = float(y1) self.horizontal_text)
x2 = float(x2)
y2 = float(y2)
region_text = text_in_bbox((x1, y2, x2, y1), self.horizontal_text)
hor_text.extend(region_text) hor_text.extend(region_text)
# find tables based on nurminen's detection algorithm # find tables based on nurminen's detection algorithm
table_bbox = self._nurminen_table_detection(hor_text) table_bbox_parses = self._nurminen_table_detection(hor_text)
else: else:
table_bbox = {} table_bbox_parses = {}
for area in self.table_areas: for area_str in self.table_areas:
x1, y1, x2, y2 = area.split(",") table_bbox_parses[bbox_from_str(area_str)] = None
x1 = float(x1) self.table_bbox_parses = table_bbox_parses
y1 = float(y1)
x2 = float(x2)
y2 = float(y2)
table_bbox[(x1, y2, x2, y1)] = None
self.table_bbox = table_bbox
def _generate_columns_and_rows(self, table_idx, tk): def _generate_columns_and_rows(self, bbox, user_cols):
# select elements which lie within table_bbox # select elements which lie within table_bbox
t_bbox = {} self.t_bbox = text_in_bbox_per_axis(
t_bbox["horizontal"] = text_in_bbox(tk, self.horizontal_text) bbox,
t_bbox["vertical"] = text_in_bbox(tk, self.vertical_text) self.horizontal_text,
self.vertical_text
)
t_bbox["horizontal"].sort(key=lambda x: (-x.y0, x.x0)) text_x_min, text_y_min, text_x_max, text_y_max = bbox_from_textlines(
t_bbox["vertical"].sort(key=lambda x: (x.x0, -x.y0)) self.t_bbox["horizontal"] + self.t_bbox["vertical"]
)
self.t_bbox = t_bbox rows_grouped = self._group_rows(
self.t_bbox["horizontal"], row_tol=self.row_tol)
text_x_min, text_y_min, text_x_max, text_y_max = self._text_bbox(self.t_bbox)
rows_grouped = self._group_rows(self.t_bbox["horizontal"], row_tol=self.row_tol)
rows = self._join_rows(rows_grouped, text_y_max, text_y_min) rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
elements = [len(r) for r in rows_grouped] elements = [len(r) for r in rows_grouped]
if self.columns is not None and self.columns[table_idx] != "": if user_cols is not None:
# user has to input boundary columns too cols = [text_x_min] + user_cols + [text_x_max]
# take (0, pdf_width) by default cols = [
# similar to else condition (cols[i], cols[i + 1])
# len can't be 1 for i in range(0, len(cols) - 1)
cols = self.columns[table_idx].split(",") ]
cols = [float(c) for c in cols]
cols.insert(0, text_x_min)
cols.append(text_x_max)
cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
else: else:
# calculate mode of the list of number of elements in # calculate mode of the list of number of elements in
# each row to guess the number of columns # each row to guess the number of columns
@ -353,14 +161,22 @@ class Stream(BaseParser):
# see if the list contains elements, if yes, then use # see if the list contains elements, if yes, then use
# the mode after removing 1s # the mode after removing 1s
elements = list(filter(lambda x: x != 1, elements)) elements = list(filter(lambda x: x != 1, elements))
if len(elements): if elements:
ncols = max(set(elements), key=elements.count) ncols = max(set(elements), key=elements.count)
else: else:
warnings.warn( warnings.warn(
f"No tables found in table area {table_idx + 1}" f"No tables found in table area {bbox}"
)
cols = [
(t.x0, t.x1)
for r in rows_grouped
if len(r) == ncols
for t in r
]
cols = self._merge_columns(
sorted(cols),
column_tol=self.column_tol
) )
cols = [(t.x0, t.x1) for r in rows_grouped if len(r) == ncols for t in r]
cols = self._merge_columns(sorted(cols), column_tol=self.column_tol)
inner_text = [] inner_text = []
for i in range(1, len(cols)): for i in range(1, len(cols)):
left = cols[i - 1][1] left = cols[i - 1][1]
@ -383,80 +199,4 @@ class Stream(BaseParser):
cols = self._add_columns(cols, inner_text, self.row_tol) cols = self._add_columns(cols, inner_text, self.row_tol)
cols = self._join_columns(cols, text_x_min, text_x_max) cols = self._join_columns(cols, text_x_min, text_x_max)
return cols, rows return cols, rows, None, None
def _generate_table(self, table_idx, cols, rows, **kwargs):
table = Table(cols, rows)
table = table.set_all_edges()
pos_errors = []
# TODO: have a single list in place of two directional ones?
# sorted on x-coordinate based on reading order i.e. LTR or RTL
for direction in ["vertical", "horizontal"]:
for t in self.t_bbox[direction]:
indices, error = get_table_index(
table,
t,
direction,
split_text=self.split_text,
flag_size=self.flag_size,
strip_text=self.strip_text,
)
if indices[:2] != (-1, -1):
pos_errors.append(error)
for r_idx, c_idx, text in indices:
table.cells[r_idx][c_idx].text = text
accuracy = compute_accuracy([[100, pos_errors]])
data = table.data
table.df = pd.DataFrame(data)
table.shape = table.df.shape
whitespace = compute_whitespace(data)
table.flavor = "stream"
table.accuracy = accuracy
table.whitespace = whitespace
table.order = table_idx + 1
table.page = int(os.path.basename(self.rootname).replace("page-", ""))
# for plotting
_text = []
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
table._text = _text
table._image = None
table._segments = None
table._textedges = self.textedges
return table
def extract_tables(self, filename, suppress_stdout=False, layout_kwargs={}):
self._generate_layout(filename, layout_kwargs)
base_filename = os.path.basename(self.rootname)
if not suppress_stdout:
logger.info(f"Processing {base_filename}")
if not self.horizontal_text:
if self.images:
warnings.warn(
f"{base_filename} is image-based, camelot only works on"
" text-based pages."
)
else:
warnings.warn(f"No tables found on {base_filename}")
return []
self._generate_table_bbox()
_tables = []
# sort tables based on y-coord
for table_idx, tk in enumerate(
sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True)
):
cols, rows = self._generate_columns_and_rows(table_idx, tk)
table = self._generate_table(table_idx, cols, rows)
table._bbox = tk
_tables.append(table)
return _tables

View File

@ -8,9 +8,164 @@ except ImportError:
else: else:
_HAS_MPL = True _HAS_MPL = True
from .utils import (bbox_from_str, bbox_from_textlines, get_textline_coords)
class PlotMethods(object): from pdfminer.layout import (
def __call__(self, table, kind="text", filename=None): LTTextLineVertical,
)
def extend_axe_lim(ax, bbox, margin=10):
"""Ensure the ax limits include the input bbox
"""
x0, x1 = ax.get_xlim()
y0, y1 = ax.get_ylim()
ax.set_xlim(min(x0, bbox[0] - margin), max(x1, bbox[2] + margin))
ax.set_ylim(min(y0, bbox[1] - margin), max(y1, bbox[3] + margin))
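# Example: if ax currently spans x in [0, 100] and bbox is
# (90, 10, 150, 40), the x-limits widen to (0, 160) with the default
# margin of 10, while y-limits already containing the bbox are kept.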
def draw_labeled_bbox(
ax, bbox, text,
color="black", linewidth=3,
linestyle="solid",
label_pos="top,left",
fontsize=12,
):
"""Utility drawing function to draw a box with an associated text label
"""
ax.add_patch(
patches.Rectangle(
(bbox[0], bbox[1]),
bbox[2] - bbox[0], bbox[3] - bbox[1],
color=color,
linewidth=linewidth, linestyle=linestyle,
fill=False
)
)
vlabel, hlabel = label_pos.split(",")
if vlabel == "top":
y = max(bbox[1], bbox[3])
elif vlabel == "bottom":
y = min(bbox[1], bbox[3])
else:
y = 0.5 * (bbox[1] + bbox[3])
# We want to draw the label outside the box (above or below)
label_align_swap = {
"top": "bottom",
"bottom": "top",
"center": "center"
}
vlabel_out_of_box = label_align_swap[vlabel]
if hlabel == "right":
x = max(bbox[0], bbox[2])
elif hlabel == "left":
x = min(bbox[0], bbox[2])
else:
x = 0.5 * (bbox[0] + bbox[2])
ax.text(
x, y,
text,
fontsize=fontsize, color="black",
verticalalignment=vlabel_out_of_box,
horizontalalignment=hlabel,
bbox=dict(facecolor=color, alpha=0.1)
)
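# Usage sketch: label a detected table bbox just outside its top-left
# corner:
#   draw_labeled_bbox(ax, (10, 10, 200, 100), "table 1",
#                     color="red", label_pos="top,left")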
def draw_pdf(table, ax):
"""Draw the content of the table's source pdf into the passed subplot
Parameters
----------
table : camelot.core.Table
ax : matplotlib.axes.Axes
"""
img = table.get_pdf_image()
ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
def draw_parse_constraints(table, ax):
"""Draw any user provided constraints (area, region, columns, etc)
Parameters
----------
table : camelot.core.Table
ax : matplotlib.axes.Axes
"""
if table.parse_details:
zone_constraints = {
"region": "table_regions",
"area": "table_areas",
}
for zone_name, zone_id in zone_constraints.items():
# Display a bbox per region / area
for zone_str in table.parse_details[zone_id] or []:
draw_labeled_bbox(
ax, bbox_from_str(zone_str),
"{zone_name}: ({zone_str})".format(
zone_name=zone_name,
zone_str=zone_str
),
color="purple",
linestyle="dotted",
linewidth=1,
label_pos="bottom,right"
)
def draw_text(table, ax):
"""Draw text, horizontal in blue, vertical in red
Parameters
----------
table : camelot.core.Table
ax : matplotlib.axes.Axes
"""
bbox = bbox_from_textlines(table.textlines)
for t in table.textlines:
color = "red" if isinstance(t, LTTextLineVertical) else "blue"
ax.add_patch(
patches.Rectangle(
(t.x0, t.y0),
t.x1 - t.x0,
t.y1 - t.y0,
color=color,
alpha=0.2
)
)
extend_axe_lim(ax, bbox)
def prepare_plot(table, ax=None):
"""Initialize plot and draw common components
Parameters
----------
table : camelot.core.Table
ax : matplotlib.axes.Axes (optional)
Returns
-------
ax : matplotlib.axes.Axes
"""
if ax is None:
fig = plt.figure()
ax = fig.add_subplot(111, aspect="equal")
draw_pdf(table, ax)
draw_parse_constraints(table, ax)
return ax
class PlotMethods():
    def __call__(self, table, kind="text", filename=None, ax=None):
        """Plot elements found on PDF page based on kind
        specified, useful for debugging and playing with different
        parameters to get the best output.
@@ -20,7 +175,8 @@ class PlotMethods(object):
        table: camelot.core.Table
            A Camelot Table.
        kind : str, optional (default: 'text')
            {'text', 'grid', 'contour', 'joint', 'line',
             'network_table_search'}
            The element type for which a plot should be generated.
        filepath: str, optional (default: None)
            Absolute path for saving the generated plot.
@@ -37,53 +193,49 @@ class PlotMethods(object):
            raise NotImplementedError(
                f"Lattice flavor does not support kind='{kind}'"
            )
        if table.flavor != "lattice" and kind in ["line"]:
            raise NotImplementedError(
                f"{table.flavor} flavor does not support kind='{kind}'"
            )

        plot_method = getattr(self, kind)
        return plot_method(table, ax)

    @staticmethod
    def text(table, ax=None):
        """Generates a plot for all text elements present
        on the PDF page.

        Parameters
        ----------
        table : camelot.core.Table
        ax : matplotlib.axes.Axes (optional)

        Returns
        -------
        fig : matplotlib.fig.Figure

        """
        ax = prepare_plot(table, ax)
        draw_text(table, ax)
        return ax.get_figure()

    @staticmethod
    def grid(table, ax=None):
        """Generates a plot for the detected table grids
        on the PDF page.

        Parameters
        ----------
        table : camelot.core.Table
        ax : matplotlib.axes.Axes (optional)

        Returns
        -------
        fig : matplotlib.fig.Figure

        """
        ax = prepare_plot(table, ax)
        for row in table.cells:
            for cell in row:
                if cell.left:
@@ -94,130 +246,247 @@ class PlotMethods(object):
                    ax.plot([cell.lt[0], cell.rt[0]], [cell.lt[1], cell.rt[1]])
                if cell.bottom:
                    ax.plot([cell.lb[0], cell.rb[0]], [cell.lb[1], cell.rb[1]])
        return ax.get_figure()
    @staticmethod
    def contour(table, ax=None):
        """Generates a plot for all table boundaries present
        on the PDF page.

        Parameters
        ----------
        table : camelot.core.Table
        ax : matplotlib.axes.Axes (optional)

        Returns
        -------
        fig : matplotlib.fig.Figure

        """
        _FOR_LATTICE = table.flavor == "lattice"
        ax = prepare_plot(table, ax)
        if not _FOR_LATTICE:
            draw_text(table, ax)
        ax.add_patch(
            patches.Rectangle(
                (table._bbox[0], table._bbox[1]),
                table._bbox[2] - table._bbox[0],
                table._bbox[3] - table._bbox[1],
                fill=False, color="red"
            )
        )
        if not _FOR_LATTICE:
            extend_axe_lim(ax, table._bbox)
        return ax.get_figure()
    @staticmethod
    def textedge(table, ax=None):
        """Generates a plot for relevant textedges.

        Parameters
        ----------
        table : camelot.core.Table
        ax : matplotlib.axes.Axes (optional)

        Returns
        -------
        fig : matplotlib.fig.Figure

        """
        ax = prepare_plot(table, ax)
        draw_text(table, ax)

        if table.flavor == "network":
            for network in table.parse_details["network_searches"]:
                most_connected_tl = network.most_connected_textline()
                ax.add_patch(
                    patches.Rectangle(
                        (most_connected_tl.x0, most_connected_tl.y0),
                        most_connected_tl.x1 - most_connected_tl.x0,
                        most_connected_tl.y1 - most_connected_tl.y0,
                        color="red",
                        alpha=0.5
                    )
                )
                for tl in sorted(
                        network._textline_to_alignments.keys(),
                        key=lambda textline: (-textline.y0, textline.x0)
                ):
                    alignments = network._textline_to_alignments[tl]
                    coords = get_textline_coords(tl)
                    alignment_id_h, tls_h = alignments.max_v()
                    alignment_id_v, tls_v = alignments.max_h()
                    xs = list(map(lambda tl: tl.x0, tls_v))
                    ys = list(map(lambda tl: tl.y1, tls_h))
                    top_h = max(ys)
                    ax.text(
                        coords[alignment_id_h],
                        top_h + 5,
                        "{max_h_count}".format(max_h_count=len(tls_h)),
                        verticalalignment="bottom",
                        horizontalalignment="center",
                        fontsize=8,
                        color="green"
                    )
                    ax.plot(
                        [coords[alignment_id_h]] * len(ys), ys,
                        color="green",
                        linestyle="solid",
                        linewidth=1,
                        marker="o",
                        markersize=3
                    )
                    left_v = min(map(lambda tl: tl.x0, tls_v))
                    ax.text(
                        left_v - 5,
                        coords[alignment_id_v],
                        "{max_v_count}".format(max_v_count=len(tls_v)),
                        verticalalignment="center",
                        horizontalalignment="right",
                        fontsize=8,
                        color="blue"
                    )
                    ax.plot(
                        xs, [coords[alignment_id_v]] * len(xs),
                        color="blue",
                        linestyle="solid",
                        linewidth=1,
                        marker="o",
                        markersize=3
                    )
        else:
            for te in table._textedges:
                ax.plot([te.coord, te.coord], [te.y0, te.y1])
        return ax.get_figure()
    @staticmethod
    def joint(table, ax=None):
        """Generates a plot for all line intersections present
        on the PDF page.

        Parameters
        ----------
        table : camelot.core.Table
        ax : matplotlib.axes.Axes (optional)

        Returns
        -------
        fig : matplotlib.fig.Figure

        """
        ax = prepare_plot(table, ax)
        x_coord = []
        y_coord = []
        for coord in table.parse["joints"]:
            x_coord.append(coord[0])
            y_coord.append(coord[1])
        ax.plot(x_coord, y_coord, "ro")
        return ax.get_figure()

    @staticmethod
    def line(table, ax=None):
        """Generates a plot for all line segments present
        on the PDF page.

        Parameters
        ----------
        table : camelot.core.Table
        ax : matplotlib.axes.Axes (optional)

        Returns
        -------
        fig : matplotlib.fig.Figure

        """
        ax = prepare_plot(table, ax)
        vertical, horizontal = table._segments
        for v in vertical:
            ax.plot([v[0], v[2]], [v[1], v[3]])
        for h in horizontal:
            ax.plot([h[0], h[2]], [h[1], h[3]])
        return ax.get_figure()
    @staticmethod
    def network_table_search(table, ax=None):
        """Generates a plot illustrating the steps of the network table search.

        Parameters
        ----------
        table : camelot.core.Table
        ax : matplotlib.axes.Axes (optional)

        Returns
        -------
        fig : matplotlib.fig.Figure

        """
        ax = prepare_plot(table, ax)
        if table.parse_details is None:
            return ax.get_figure()
        parse_details = table.parse_details
        for box_id, bbox_search in enumerate(parse_details["bbox_searches"]):
            max_h_gap = bbox_search["max_h_gap"]
            max_v_gap = bbox_search["max_v_gap"]
            iterations = bbox_search["iterations"]
            for iteration, bbox in enumerate(iterations):
                final = iteration == len(iterations) - 1
                draw_labeled_bbox(
                    ax, bbox,
                    "t{box_id}/i{iteration}".format(
                        box_id=box_id,
                        iteration=iteration
                    ),
                    color="red",
                    linewidth=5 if final else 2,
                    fontsize=12 if final else 8,
                    label_pos="bottom,left"
                )
                ax.add_patch(
                    patches.Rectangle(
                        (bbox[0]-max_h_gap, bbox[1]-max_v_gap),
                        bbox[2] - bbox[0] + 2 * max_h_gap,
                        bbox[3] - bbox[1] + 2 * max_v_gap,
                        color="orange",
                        fill=False
                    )
                )
        for box_id, col_search in enumerate(parse_details["col_searches"]):
            draw_labeled_bbox(
                ax, col_search["bbox_full"],
                "box body + header #{box_id}".format(
                    box_id=box_id
                ),
                color="red",
                linewidth=4,
                label_pos="top,left"
            )
            draw_labeled_bbox(
                ax, col_search["bbox_body"],
                "box body #{box_id}".format(
                    box_id=box_id
                ),
                color="orange",
                linewidth=2,
                label_pos="bottom,left"
            )
            for col_anchor in col_search["cols_anchors"]:
                # Display a green line at the col boundary line throughout the
                # table bbox.
                ax.plot(
                    [col_anchor, col_anchor],
                    [
                        col_search["bbox_body"][1] - 10,
                        col_search["bbox_body"][3] + 10,
                    ],
                    color="green"
                )
        return ax.get_figure()

@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-

import os
import atexit
import re
import random
import shutil
@@ -9,8 +10,10 @@ import tempfile
import warnings
from itertools import groupby
from operator import itemgetter
from urllib.request import Request

import numpy as np
import pandas as pd
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
@@ -27,7 +30,9 @@ from pdfminer.layout import (
    LTImage,
)

from .ext.ghostscript import Ghostscript
from urllib.request import urlopen
from urllib.parse import urlparse as parse_url
from urllib.parse import uses_relative, uses_netloc, uses_params
@@ -93,8 +98,21 @@ def download_url(url):
    return filepath


common_kwargs = [
    "flag_size",
    "margins",
    "split_text",
    "strip_text",
    "table_areas",
    "table_regions"
]
text_kwargs = common_kwargs + [
    "columns",
    "edge_tol",
    "row_tol",
    "column_tol"
]
lattice_kwargs = common_kwargs + [
    "process_background",
    "line_scale",
    "copy_text",
@@ -106,42 +124,72 @@ lattice_kwargs = [
    "iterations",
    "resolution",
]
flavor_to_kwargs = {
    "stream": text_kwargs,
    "network": text_kwargs,
    "lattice": lattice_kwargs,
    "hybrid": text_kwargs + lattice_kwargs,
}


def validate_input(kwargs, flavor="lattice"):
    parser_kwargs = flavor_to_kwargs[flavor]
    # s.difference(t): new set with elements in s but not in t
    isec = set(kwargs.keys()).difference(set(parser_kwargs))
    if isec:
        raise ValueError(
            f"{','.join(sorted(isec))} cannot be used with flavor='{flavor}'"
        )


def remove_extra(kwargs, flavor="lattice"):
    parser_kwargs = flavor_to_kwargs[flavor]
    # Avoid "dictionary changed size during iteration"
    kwargs_keys = list(kwargs.keys())
    for key in kwargs_keys:
        if key not in parser_kwargs:
            kwargs.pop(key)
    return kwargs


# https://stackoverflow.com/a/22726782
# and https://stackoverflow.com/questions/10965479
class TemporaryDirectory():
    def __init__(self):
        self.dir_path = None

    def __enter__(self):
        self.dir_path = tempfile.mkdtemp()
        # Only delete the temporary directory upon
        # program exit.
        atexit.register(shutil.rmtree, self.dir_path)
        return self.dir_path

    def __exit__(self, exc_type, exc_value, traceback):
        pass


def build_file_path_in_temp_dir(filename, extension=None):
    """Generates a new path within a temporary directory

    Parameters
    ----------
    filename : str
    extension : str

    Returns
    -------
    file_path_in_temporary_dir : str

    """
    with TemporaryDirectory() as temp_dir:
        if extension:
            filename = filename + extension
        path = os.path.join(
            temp_dir,
            filename
        )
    return path


def translate(x1, x2):
@@ -247,8 +295,9 @@ def scale_image(tables, v_segments, h_segments, factors):
        j_x, j_y = zip(*tables[k])
        j_x = [scale(j, scaling_factor_x) for j in j_x]
        j_y = [scale(abs(translate(-img_y, j)), scaling_factor_y) for j in j_y]
        tables_new[(x1, y1, x2, y2)] = {
            "joints": list(zip(j_x, j_y))
        }

    v_segments_new = []
    for v in v_segments:
@@ -296,9 +345,10 @@ def get_rotation(chars, horizontal_text, vertical_text):
    hlen = len([t for t in horizontal_text if t.get_text().strip()])
    vlen = len([t for t in vertical_text if t.get_text().strip()])
    if hlen < vlen:
        clockwise = sum(t.matrix[1] < 0 < t.matrix[2] for t in chars)
        anticlockwise = sum(t.matrix[1] > 0 > t.matrix[2] for t in chars)
        rotation = "anticlockwise" if clockwise < anticlockwise \
            else "clockwise"
    return rotation
@@ -329,18 +379,98 @@ def segments_in_bbox(bbox, v_segments, h_segments):
    v_s = [
        v
        for v in v_segments
        if v[1] > lb[1] - 2 and
        v[3] < rt[1] + 2 and
        lb[0] - 2 <= v[0] <= rt[0] + 2
    ]
    h_s = [
        h
        for h in h_segments
        if h[0] > lb[0] - 2 and
        h[2] < rt[0] + 2 and
        lb[1] - 2 <= h[1] <= rt[1] + 2
    ]
    return v_s, h_s
def get_textline_coords(textline):
    """Calculate the coordinates of each alignment for a given textline.
    """
    return {
        "left": textline.x0,
        "right": textline.x1,
        "middle": (textline.x0 + textline.x1) / 2.0,
        "bottom": textline.y0,
        "top": textline.y1,
        "center": (textline.y0 + textline.y1) / 2.0,
    }


def bbox_from_str(bbox_str):
    """Deserialize bbox from string ("x1,y1,x2,y2") to tuple (x1, y1, x2, y2).

    Parameters
    ----------
    bbox_str : str
        Serialized bbox with comma separated coordinates, "x1,y1,x2,y2".

    Returns
    -------
    bbox : tuple
        Tuple (x1, y1, x2, y2).

    """
    x1, y1, x2, y2 = bbox_str.split(",")
    x1 = float(x1)
    y1 = float(y1)
    x2 = float(x2)
    y2 = float(y2)
    return (
        min(x1, x2),
        min(y1, y2),
        max(x1, x2),
        max(y1, y2)
    )


def bboxes_overlap(bbox1, bbox2):
    (left1, bottom1, right1, top1) = bbox1
    (left2, bottom2, right2, top2) = bbox2
    return (
        (left1 < left2 < right1) or (left1 < right2 < right1)
    ) and (
        (bottom1 < bottom2 < top1) or (bottom1 < top2 < top1)
    )


def textlines_overlapping_bbox(bbox, textlines):
    """Returns all text objects which overlap or are within a bounding box.

    Parameters
    ----------
    bbox : tuple
        Tuple (x1, y1, x2, y2) representing a bounding box where
        (x1, y1) -> lb and (x2, y2) -> rt in the PDF coordinate
        space.
    textlines : List of PDFMiner text objects.

    Returns
    -------
    t_bbox : list
        List of PDFMiner text objects.

    """
    t_bbox = [
        t
        for t in textlines
        if bboxes_overlap(bbox, (t.x0, t.y0, t.x1, t.y1))
    ]
    return t_bbox
def text_in_bbox(bbox, text):
    """Returns all text objects which lie at least 50% inside a bounding box
    across both dimensions.

    Parameters
    ----------
@@ -367,6 +497,214 @@ def text_in_bbox(bbox, text):
    return t_bbox
def text_in_bbox_per_axis(bbox, horizontal_text, vertical_text):
    """Returns all text objects present inside a bounding box, split between
    horizontal and vertical text.

    Parameters
    ----------
    bbox : tuple
        Tuple (x1, y1, x2, y2) representing a bounding box where
        (x1, y1) -> lb and (x2, y2) -> rt in the PDF coordinate
        space.
    horizontal_text : List of PDFMiner text objects.
    vertical_text : List of PDFMiner text objects.

    Returns
    -------
    t_bbox : dict
        Dict of lists of PDFMiner text objects that lie inside the table,
        with one key each for "horizontal" and "vertical".

    """
    t_bbox = {}
    t_bbox["horizontal"] = text_in_bbox(bbox, horizontal_text)
    t_bbox["vertical"] = text_in_bbox(bbox, vertical_text)
    t_bbox["horizontal"].sort(key=lambda x: (-x.y0, x.x0))
    t_bbox["vertical"].sort(key=lambda x: (x.x0, -x.y0))
    return t_bbox


def expand_bbox_with_textline(bbox, textline):
    """Expand (if needed) a bbox so that it fits the parameter textline.
    """
    return (
        min(bbox[0], textline.x0),
        min(bbox[1], textline.y0),
        max(bbox[2], textline.x1),
        max(bbox[3], textline.y1)
    )


def bbox_from_textlines(textlines):
    """Returns the smallest bbox containing all the text objects passed as
    parameters.

    Parameters
    ----------
    textlines : List of PDFMiner text objects.

    Returns
    -------
    bbox : tuple
        Tuple (x1, y1, x2, y2) representing a bounding box where
        (x1, y1) -> lb and (x2, y2) -> rt in the PDF coordinate
        space.

    """
    if len(textlines) == 0:
        return None
    bbox = (
        textlines[0].x0,
        textlines[0].y0,
        textlines[0].x1,
        textlines[0].y1
    )
    for tl in textlines[1:]:
        bbox = expand_bbox_with_textline(bbox, tl)
    return bbox


def find_columns_boundaries(tls, min_gap=1.0):
    """Make a list of disjoint column boundaries for a list of text objects.

    Parameters
    ----------
    tls : list of PDFMiner text objects.
    min_gap : minimum distance between columns. Any elements closer than
        this threshold are merged together, to prevent spaces between
        words from being misinterpreted as column boundaries.

    Returns
    -------
    boundaries : list
        List of x-coordinates for columns.
        [(1st col left, 1st col right), (2nd col left, 2nd col right), ...]

    """
    cols_bounds = []
    tls.sort(key=lambda tl: tl.x0)
    for tl in tls:
        if (not cols_bounds) or cols_bounds[-1][1] + min_gap < tl.x0:
            cols_bounds.append([tl.x0, tl.x1])
        else:
            cols_bounds[-1][1] = max(cols_bounds[-1][1], tl.x1)
    return cols_bounds


def find_rows_boundaries(tls, min_gap=1.0):
    """Make a list of disjoint row boundaries for a list of text objects.

    Parameters
    ----------
    tls : list of PDFMiner text objects.
    min_gap : minimum distance between rows. Any elements closer than
        this threshold are merged together.

    Returns
    -------
    boundaries : list
        List of y-coordinates for rows.
        [(1st row bottom, 1st row top), (2nd row bottom, 2nd row top), ...]

    """
    rows_bounds = []
    tls.sort(key=lambda tl: tl.y0)
    for tl in tls:
        if (not rows_bounds) or rows_bounds[-1][1] + min_gap < tl.y0:
            rows_bounds.append([tl.y0, tl.y1])
        else:
            rows_bounds[-1][1] = max(rows_bounds[-1][1], tl.y1)
    return rows_bounds


def boundaries_to_split_lines(boundaries):
    """Find split lines given a list of boundaries between rows or cols.

        Boundaries: [ a ]  [b]  [  c  ]  [d]
        Splits:     |    |    |        |    |

    Parameters
    ----------
    boundaries : list
        List of tuples of x- (for columns) or y- (for rows) coord boundaries.
        These are the (left, right most) or (bottom, top most) coordinates.

    Returns
    -------
    anchors : list
        List of coordinates representing the split points, each half way
        between boundaries.

    """
    # From the row boundaries, identify splits by getting the mid points
    # between the boundaries.
    anchors = list(map(
        lambda idx: (boundaries[idx-1][1] + boundaries[idx][0]) / 2.0,
        range(1, len(boundaries))
    ))
    anchors.insert(0, boundaries[0][0])
    anchors.append(boundaries[-1][1])
    return anchors


def get_index_closest_point(point, sorted_list, fn=lambda x: x):
    """Return the index of the closest point in the sorted list.

    Parameters
    ----------
    point : the reference sortable element to search.
    sorted_list : list
    fn : optional accessor function

    Returns
    -------
    index : int

    """
    n = len(sorted_list)
    if n == 0:
        return None
    if n == 1:
        return 0
    left = 0
    right = n - 1
    mid = 0
    if point >= fn(sorted_list[n - 1]):
        return n - 1
    if point <= fn(sorted_list[0]):
        return 0
    while left < right:
        mid = (left + right) // 2  # find the mid
        mid_val = fn(sorted_list[mid])
        if point < mid_val:
            right = mid
        elif point > mid_val:
            left = mid + 1
        else:
            return mid
    if mid_val > point:
        if mid > 0 and (
                point - fn(sorted_list[mid-1]) <
                mid_val - point):
            return mid-1
    elif mid_val < point:
        if mid < n - 1 and (
                fn(sorted_list[mid+1]) - point <
                point - mid_val):
            return mid+1
    return mid
def merge_close_lines(ar, line_tol=2):
    """Merges lines which are within a tolerance by calculating a
    moving mean, based on their x or y axis projections.
@@ -452,10 +790,10 @@ def flag_font_size(textline, direction, strip_text=""):
        for t in textline
        if not isinstance(t, LTAnno)
    ]
    text_sizes = [np.round(size, decimals=6) for text, size in d]
    if len(set(text_sizes)) > 1:
        flist = []
        min_size = min(text_sizes)
        for key, chars in groupby(d, itemgetter(1)):
            if key == min_size:
                fchars = [t[0] for t in chars]
@@ -469,12 +807,12 @@ def flag_font_size(textline, direction, strip_text=""):
            flist.append("".join(fchars))
        fstring = "".join(flist)
    else:
        fstring = "".join(t.get_text() for t in textline)
    return text_strip(fstring, strip_text)


def split_textline(table, textline, direction, flag_size=False, strip_text=""):
    """Split PDFMiner LTTextLine into substrings if it spans across
    multiple rows/columns.

    Parameters
@@ -499,7 +837,6 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""):
        of row/column and text is the lttextline substring.

    """
    cut_text = []
    bbox = textline.bbox
    try:
@@ -516,7 +853,9 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""):
            ]
            r = r_idx[0]
            x_cuts = [
                (c, table.cells[r][c].x2)
                for c in x_overlap
                if table.cells[r][c].right
            ]
            if not x_cuts:
                x_cuts = [(x_overlap[0], table.cells[r][-1].x2)]
@@ -530,7 +869,6 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""):
                    ):
                        cut_text.append((r, cut[0], obj))
                        break
                    # TODO: add test
                    if cut == x_cuts[-1]:
                        cut_text.append((r, cut[0] + 1, obj))
@@ -549,7 +887,9 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""):
            ]
            c = c_idx[0]
            y_cuts = [
                (r, table.cells[r][c].y1)
                for r in y_overlap
                if table.cells[r][c].bottom
            ]
            if not y_cuts:
                y_cuts = [(y_overlap[0], table.cells[-1][c].y1)]
@@ -557,13 +897,10 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""):
            col = table.cols[c]
            for cut in y_cuts:
                if isinstance(obj, LTChar):
                    if col[0] <= (obj.x0 + obj.x1) / 2 <= col[1] \
                            and (obj.y0 + obj.y1) / 2 >= cut[1]:
                        cut_text.append((cut[0], c, obj))
                        break
                    # TODO: add test
                    if cut == y_cuts[-1]:
                        cut_text.append((cut[0] - 1, c, obj))
@@ -632,9 +969,8 @@ def get_table_index(
    """
    r_idx, c_idx = [-1] * 2
    for r in range(len(table.rows)):
        if (t.y0 + t.y1) / 2.0 < table.rows[r][0] and \
                (t.y0 + t.y1) / 2.0 > table.rows[r][1]:
            lt_col_overlap = []
            for c in table.cols:
                if c[0] <= t.x1 and c[1] >= t.x0:
@@ -648,7 +984,8 @@ def get_table_index(
                text_range = (t.x0, t.x1)
                col_range = (table.cols[0][0], table.cols[-1][1])
                warnings.warn(
                    f"{text} {text_range} does not lie in column range "
                    f"{col_range}"
                )
            r_idx = r
            c_idx = lt_col_overlap.index(max(lt_col_overlap))
@@ -667,7 +1004,9 @@ def get_table_index(
    X = 1.0 if abs(t.x0 - t.x1) == 0.0 else abs(t.x0 - t.x1)
    Y = 1.0 if abs(t.y0 - t.y1) == 0.0 else abs(t.y0 - t.y1)
    charea = X * Y
    error = (
        (X * (y0_offset + y1_offset)) + (Y * (x0_offset + x1_offset))
    ) / charea

    if split_text:
        return (
@@ -676,20 +1015,21 @@ def get_table_index(
        ),
        error,
    )
    if flag_size:
        return (
            [
                (
                    r_idx,
                    c_idx,
                    flag_font_size(t._objs,
                                   direction,
                                   strip_text=strip_text),
                )
            ],
            error,
        )
    return [(r_idx, c_idx, text_strip(t.get_text(), strip_text))], \
        error


def compute_accuracy(error_weights):
@@ -711,7 +1051,7 @@ def compute_accuracy(error_weights):
    SCORE_VAL = 100
    try:
        score = 0
        if sum(ew[0] for ew in error_weights) != SCORE_VAL:
            raise ValueError("Sum of weights should be equal to 100.")
        for ew in error_weights:
            weight = ew[0] / len(ew[1])
@@ -737,7 +1077,6 @@ def compute_whitespace(d):
    """
    whitespace = 0
    for i in d:
        for j in i:
            if j.strip() == "":
@@ -752,8 +1091,7 @@ def get_page_layout(
    line_margin=0.5,
    word_margin=0.1,
    detect_vertical=True,
    all_texts=True):
    """Returns a PDFMiner LTPage object and page dimension of a single
    page pdf. See https://euske.github.io/pdfminer/ to get definitions
    of kwargs.
@@ -797,6 +1135,7 @@ def get_page_layout(
            width = layout.bbox[2]
            height = layout.bbox[3]
            dim = (width, height)
            break  # we assume a single page pdf
    return layout, dim
@@ -838,3 +1177,117 @@ def get_text_objects(layout, ltype="char", t=None):
    except AttributeError:
        pass
    return t
def export_pdf_as_png(pdf_path, destination_path, resolution=300):
    """Generate an image from a pdf.

    Parameters
    ----------
    pdf_path : str
    destination_path : str
    resolution : int

    """
    gs_call = "-q -sDEVICE=png16m -o " \
              "{destination_path} -r{resolution} {pdf_path}" \
        .format(
            destination_path=destination_path,
            resolution=resolution,
            pdf_path=pdf_path
        )
    gs_call = gs_call.encode().split()
    null = open(os.devnull, "wb")
    Ghostscript(*gs_call, stdout=null)
    null.close()


def compare_tables(left, right):
    """Compares two tables and displays their differences in a human
    readable form.

    Parameters
    ----------
    left : data frame
    right : data frame

    """
    diff_cols = right.shape[1] - left.shape[1]
    diff_rows = right.shape[0] - left.shape[0]
    differences = []
    if diff_rows:
        differences.append(
            "{diff_rows} {more_fewer} rows".format(
                diff_rows=abs(diff_rows),
                more_fewer='more' if diff_rows > 0 else 'fewer'
            )
        )
    if diff_cols:
        differences.append(
            "{diff_cols} {more_fewer} columns".format(
                diff_cols=abs(diff_cols),
                more_fewer='more' if diff_cols > 0 else 'fewer'
            )
        )
    if differences:
        differences_str = " and ".join(differences)
        print(
            "Right has {differences_str} than left "
            "{shape_left} vs {shape_right}".format(
                differences_str=differences_str,
                shape_left=[left.shape[0], left.shape[1]],
                shape_right=[right.shape[0], right.shape[1]],
            )
        )
    table1, table2 = [left, right]
    name_table1, name_table2 = ["left", "right"]
    if not diff_cols:
        # Same number of cols: compare rows since they're of the same length
        if diff_rows > 0:
            # Use the longest table as a reference
            table1, table2 = table2, table1
            name_table1, name_table2 = name_table2, name_table1
        for index, lrow in table1.iterrows():
            if index < table2.shape[0]:
                srow = table2.loc[index, :]
                if not lrow.equals(srow):
                    diff_df = pd.DataFrame()
                    diff_df = diff_df.append(lrow, ignore_index=True)
                    diff_df = diff_df.append(srow, ignore_index=True)
                    diff_df.insert(0, 'Table', [name_table1, name_table2])
                    print("Row {index} differs:".format(index=index))
                    print(diff_df.values)
                    break
            else:
                print("Row {index} unique to {name_table1}: {lrow}".format(
                    index=index,
                    name_table1=name_table1,
                    lrow=lrow
                ))
                break
    elif not diff_rows:
        # Same number of rows: compare columns since they're of the same
        # length
        if diff_cols > 0:
            # Use the longest table as a reference
            table1, table2 = table2, table1
            name_table1, name_table2 = name_table2, name_table1
        for i, col in enumerate(table1.columns):
            lcol = table1.iloc[:, i]
            if col in table2:
                scol = table2.iloc[:, i]
                if not lcol.equals(scol):
                    diff_df = pd.DataFrame()
                    diff_df[name_table1] = scol
                    diff_df[name_table2] = lcol
                    diff_df["Match"] = lcol == scol
                    print(
                        "Column {i} different:\n"
                        "{diff_df}".format(
                            i=i,
                            diff_df=diff_df
                        )
                    )
                    break
            else:
                print("Column {i} unique to {name_table1}: {lcol}".format(
                    i=i,
                    name_table1=name_table1,
                    lcol=lcol
                ))
                break
    else:
        print("Tables have different shapes")

@@ -13,7 +13,7 @@ The easiest way to install Camelot is to install it with `conda`_, which is a pa
$ conda install -c conda-forge camelot-py

.. note:: Camelot is available for Python 3.5, 3.6 and 3.7 on Linux, macOS and Windows. For Windows, you will need to install ghostscript which you can get from their `downloads page`_.

.. _conda: https://conda.io/docs/
.. _Anaconda: http://docs.continuum.io/anaconda/

@@ -0,0 +1,351 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Hybrid Parser step-by-step\n",
"\n",
"This notebook describes the algorithms behind the hybrid parser, which blends the results of the network parser (text based) and the lattice parser (image based).\n",
"\n",
"You can modify the section below to point to a pdf or your choice to visualize how the algorithm analyzes it. By default, it points to one of the test .pdfs included with camelot.\n",
"\n",
"You can also use the `parser-comparison-notebook` notebook to compare the parsers results side-by-side."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Bootstrap and common imports\n",
"import os, sys, time\n",
"sys.path.insert(0, os.path.abspath('')) # Prefer the local version of camelot if available\n",
"import camelot\n",
"\n",
"print(f\"Using Camelot v{camelot.__version__} from file {camelot.__file__}.\")\n",
"\n",
"# Select a pdf to analyze.\n",
"kwargs = {}\n",
"data = None\n",
"# pdf_file = \"vertical_header.pdf\" # test_network_vertical_header\n",
"# pdf_file, kwargs = \"background_lines_1.pdf\", {} # {\"process_background\": True} # test_lattice_process_background\n",
"\n",
"# pdf_file, kwargs, data = \"superscript.pdf\", {\"flag_size\": True}, data_stream_flag_size # test_network_flag_size\n",
"# pdf_file = \"health.pdf\" # test_network\n",
"# pdf_file = \"clockwise_table_2.pdf\"\n",
"# pdf_file = \"tabula/12s0324.pdf\" # interesting because contains two separate tables\n",
"# pdf_file, kwargs = \"tabula/us-007.pdf\", {\"table_regions\": [\"320,335,573,505\"]} # test_network_table_regions\n",
"# pdf_file, kwargs = \"tabula/us-007.pdf\", {\"table_areas\": [\"320,500,573,335\"]} # test_network_table_areas\n",
"# pdf_file, kwargs = \"detect_vertical_false.pdf\", {\"strip_text\": \" ,\\n\"} # data_stream_strip_text\n",
"# pdf_file, kwargs, data = \"tabula/m27.pdf\", {\"columns\": [\"72,95,209,327,442,529,566,606,683\"], \"split_text\": True, }, data_stream_split_text # data_stream_split_text\n",
"# pdf_file = \"clockwise_table_2.pdf\" # test_network_table_rotated / test_stream_table_rotated\n",
"pdf_file = \"vertical_header.pdf\"\n",
"\n",
"# pdf_file = \"twotables_2.pdf\"\n",
"# pdf_file = \"camelot-issue-132-multiple-tables.pdf\"\n",
"# pdf_file, kwargs, data = \"edge_tol.pdf\", {\"edge_tol\": 500}, data_stream_edge_tol\n",
"# pdf_file, kwargs, data = \"edge_tol.pdf\", {}, data_stream_edge_tol\n",
"\n",
"filename = os.path.join(\n",
" os.path.dirname(os.path.abspath('.')),\n",
" \"camelot/tests/files\",\n",
" pdf_file\n",
")\n",
"\n",
"# Set up plotting options\n",
"import matplotlib.pyplot as plt\n",
"%matplotlib inline\n",
"PLOT_HEIGHT = 12\n",
"def init_figure_and_axis(title):\n",
" fig = plt.figure(figsize=(PLOT_HEIGHT * 2.5, PLOT_HEIGHT))\n",
" ax = fig.add_subplot(111)\n",
" ax.set_title(title)\n",
" return fig, ax\n",
"\n",
"# Utility function to display tables\n",
"def display_parse_results(tables, parse_time, flavor):\n",
" if not tables:\n",
" return\n",
" tables_dims = \", \".join(\n",
" map(\n",
" lambda table: \"{rows}x{cols}\".format(\n",
" rows=table.shape[0],\n",
" cols=table.shape[1],\n",
" ), tables\n",
" )\n",
" )\n",
" print(f\"The {flavor} parser found {len(tables)} table(s) ({tables_dims}) in {parse_time:.2f}s\")\n",
" for table in tables:\n",
" display(table.df)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Overall Algorithm\n",
"\n",
"The hybrid parser combines results from the network parser and the lattice parser to get the \"best of both worlds.\" Before we look at the combination itself, let's see how each of the two parsers work.\n",
"\n",
"### Network parser\n",
"\n",
"The network parser is text-based: it relies on the bounding boxes of the text elements encoded in the .pdf document to identify patterns indicative of a table.\n",
"\n",
"The plot belows shows the bounding boxes of all the text elements on the parsed document, in light blue for horizontal elements, light red for vertical elements (rare in most documents)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Parse file\n",
"flavor = \"network\"\n",
"timer_before_parse = time.perf_counter()\n",
"tables = camelot.read_pdf(filename, flavor=flavor, debug=True, **kwargs)\n",
"timer_after_parse = time.perf_counter()\n",
"\n",
"if tables:\n",
" fig, ax = init_figure_and_axis(f\"Text elements in PDF\\n{pdf_file}\")\n",
" camelot.plot(tables[0], kind=\"text\", ax=ax)\n",
"else:\n",
" print(\"No table found for this document.\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Network parser - step 1: Identify a network of connected alignments\n",
"\n",
"The network parser starts by identifying common horizontal (shown in green on the plot below) or vertical (in blue) coordinate alignments across these text elements. In other words it looks for bounding box rectangles which either share the same top, center, or bottom coordinates (horizontal axis), or the same left, right, or middle coordinates (vertical axis). See the `generate` method.\n",
"\n",
"Once the parser found these alignments, it performs some pruning to only keep text elements that are part of a network - they have connections along both axis The idea is that it's not enough for two elements to be aligned to belong to a table, for instance the lines of text in this paragraph are all left-aligned, but they do not form a network. The pruning is done iteratively, see `remove_unconnected_edges` method.\n",
"\n",
"Once the network is pruned, the parser keeps track of how many alignments each text element belongs to: that's the number on top (vertical alignments) or to the left of each alignment in the plot below. The text element with the most connections (in red on the plot) is the starting point -the *seed*- of the next step. Finally, the parser measures how far the alignments are from one another, to determine a plausible search zone around each cell for the next stage of growing the table. See `compute_plausible_gaps` method."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"if tables:\n",
" fig, ax = init_figure_and_axis(f\"Text edges in PDF\\n{pdf_file}\")\n",
" camelot.plot(tables[0], kind=\"textedge\", ax=ax)\n",
"else:\n",
" print(f\"No table found for document {pdf_file}.\")"
]
},
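{
"cell_type": "markdown",
"metadata": {},
"source": [
"To make the alignment search concrete, the next cell is a minimal, self-contained sketch rather than camelot's implementation: it groups toy bounding boxes by shared coordinates, with made-up names and tolerances."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A minimal, self-contained sketch of the alignment search. This is not\n",
"# camelot's implementation: group_alignments and its bucketing are made\n",
"# up for illustration.\n",
"from collections import defaultdict\n",
"\n",
"def group_alignments(bboxes, tol=0.5):\n",
"    # Each box is an (x0, y0, x1, y1) tuple. Every box registers the six\n",
"    # coordinates mentioned above: left/middle/right, bottom/center/top.\n",
"    alignments = defaultdict(list)\n",
"    for box in bboxes:\n",
"        x0, y0, x1, y1 = box\n",
"        coords = {\n",
"            \"left\": x0, \"middle\": (x0 + x1) / 2, \"right\": x1,\n",
"            \"bottom\": y0, \"center\": (y0 + y1) / 2, \"top\": y1,\n",
"        }\n",
"        for name, value in coords.items():\n",
"            # Bucket coordinates so that values within tol land together\n",
"            alignments[(name, round(value / tol))].append(box)\n",
"    # Only alignments shared by at least two elements are meaningful\n",
"    return {k: v for k, v in alignments.items() if len(v) > 1}\n",
"\n",
"# Two left-aligned boxes plus an unrelated one\n",
"boxes = [(10, 100, 50, 110), (10, 80, 60, 90), (100, 40, 140, 50)]\n",
"for key, members in group_alignments(boxes).items():\n",
"    print(key, \"->\", len(members), \"boxes\")"
]
},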
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Network parser - step 2: Detect table body iteratively from seed\n",
"\n",
"In the next step, the parser iteratively \"grows\" a table, starting from the seed identified in the previous step. The bounding box is initialized with the bounding box of the seed, then it iteratively searches for text elements that are close to the bounding box, then grows the table to ingest them, until there are no more text elements to ingest. The two steps are:\n",
"* Search: create a search bounding box by expanding the current table bounding box in all directions, based on the plausible gap numbers determined above. Search bounding boxes are shown in orange on the graph below. \n",
"* Grow: if a networked text element is found in this search area, expand the table bounding box so that it includes this new element. Each successive table bounding box is shown in red in the plot below.\n",
"\n",
"Notice in the plot below how the search area and the table bounding box grow starting from the seed. See method `search_table_body`.\n",
"\n",
"#### Network parser - step 3: Search for a header section\n",
"\n",
"Headers are often aligned differently from the rest of the table. To account for this, the network parser searches for text elements that are good candidates for a header section: these text elements are just above the bounding box of the body of the table, and they fit within the rows identified in the table body. See the method `search_header_from_body_bbox`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"if tables:\n",
" fig, ax = init_figure_and_axis(f\"Growth steps for table in PDF\\n{pdf_file}\")\n",
" camelot.plot(tables[0], kind=\"network_table_search\", ax=ax)\n",
"else:\n",
" print(\"No table found for this document.\")"
]
},
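{
"cell_type": "markdown",
"metadata": {},
"source": [
"The next cell sketches this search/grow loop on toy coordinate tuples; the function name and gap values are illustrative assumptions rather than camelot's actual code."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A toy version of the search/grow loop. grow_table and its gap values\n",
"# are illustrative assumptions, not camelot's actual code.\n",
"def grow_table(seed, boxes, max_h_gap=20, max_v_gap=10):\n",
"    table = list(seed)  # current table bbox, initialized with the seed\n",
"    remaining = list(boxes)\n",
"    changed = True\n",
"    while changed:\n",
"        changed = False\n",
"        # Search: expand the current bbox by the plausible gaps\n",
"        search = (table[0] - max_h_gap, table[1] - max_v_gap,\n",
"                  table[2] + max_h_gap, table[3] + max_v_gap)\n",
"        for box in remaining[:]:\n",
"            cx, cy = (box[0] + box[2]) / 2, (box[1] + box[3]) / 2\n",
"            if search[0] <= cx <= search[2] and search[1] <= cy <= search[3]:\n",
"                # Grow: expand the table bbox to ingest the new element\n",
"                table = [min(table[0], box[0]), min(table[1], box[1]),\n",
"                         max(table[2], box[2]), max(table[3], box[3])]\n",
"                remaining.remove(box)\n",
"                changed = True\n",
"    return tuple(table)\n",
"\n",
"seed = (100, 100, 140, 110)\n",
"others = [(100, 85, 150, 95), (145, 85, 185, 95), (300, 300, 340, 310)]\n",
"print(grow_table(seed, others))  # the far-away box is never ingested"
]
},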
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Network parser - step 4: Repeat\n",
"\n",
"There are sometimes multiple tables on one page. So once a first table is identified, all the text edges it contains are removed, and the algorithm is repeated until no new network is identified.\n",
"\n",
"The final parse for this .pdf is as follows:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"display_parse_results(tables, timer_after_parse - timer_before_parse, flavor)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Lattice parser\n",
"\n",
"The lattice parser is based on an analyzis of the image from the .pdf, rather than its text content. It relies on the borders of the tables to be solid vertical lines.\n",
"\n",
"#### Lattice parser - step 1: Identify solid lines within the document.\n",
"\n",
"The lattice parser relies on the OpenCV library (`getStructuringElement` function) to detect all solid vertical and horizontal lines within the document."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Parse file\n",
"flavor = \"lattice\"\n",
"timer_before_parse = time.perf_counter()\n",
"tables = camelot.read_pdf(filename, flavor=flavor, debug=True, **kwargs)\n",
"timer_after_parse = time.perf_counter()\n",
"\n",
"if tables:\n",
" fig, ax = init_figure_and_axis(f\"Line structure in PDF\\n{pdf_file}\")\n",
" camelot.plot(tables[0], kind=\"line\", ax=ax)\n",
"else:\n",
" print(\"No table found for this document.\")\n",
"\n"
]
},
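{
"cell_type": "markdown",
"metadata": {},
"source": [
"The next cell sketches the underlying OpenCV technique on a synthetic image: morphological opening with long, thin structuring elements keeps only solid line segments. This is the general recipe, not camelot's exact code."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A standard OpenCV recipe for isolating solid lines on a binarized\n",
"# page image; a sketch of the technique, not camelot's exact code.\n",
"import cv2\n",
"import numpy as np\n",
"\n",
"page = np.zeros((200, 200), dtype=np.uint8)\n",
"cv2.rectangle(page, (20, 20), (180, 120), 255, 1)  # synthetic table border\n",
"\n",
"# Morphological opening with long, thin kernels keeps only segments at\n",
"# least as long as the kernel, i.e. solid lines.\n",
"h_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1))\n",
"v_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 40))\n",
"horizontal = cv2.morphologyEx(page, cv2.MORPH_OPEN, h_kernel)\n",
"vertical = cv2.morphologyEx(page, cv2.MORPH_OPEN, v_kernel)\n",
"print(\"horizontal px:\", int(horizontal.sum() / 255),\n",
"      \"vertical px:\", int(vertical.sum() / 255))"
]
},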
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Lattice parser - step 2: Find the contours of the table(s) based on the solid lines.\n",
"\n",
"The lattice parser then uses OpenCV's `findContours` function to detect the overall bounding box of the table(s), since the solid lines might draw more than one table."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for table in tables:\n",
" fig, ax = init_figure_and_axis(f\"Contour structure in PDF\\n{pdf_file}\")\n",
" camelot.plot(table, kind=\"contour\", ax=ax)\n"
]
},
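{
"cell_type": "markdown",
"metadata": {},
"source": [
"The next cell demonstrates the same OpenCV call on a synthetic mask, recovering the bounding rectangle of a filled region (it assumes OpenCV >= 4, where `findContours` returns two values)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Toy contour detection on a synthetic mask (assumes OpenCV >= 4, where\n",
"# findContours returns two values).\n",
"import cv2\n",
"import numpy as np\n",
"\n",
"mask = np.zeros((100, 100), dtype=np.uint8)\n",
"cv2.rectangle(mask, (10, 10), (80, 60), 255, -1)  # filled \"table\" area\n",
"contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)\n",
"print([cv2.boundingRect(c) for c in contours])  # -> [(10, 10, 71, 51)]"
]
},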
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Lattice parser - step 3: Identify joints\n",
"\n",
"For each table bounding box (contour), the lattice parser then makes a list of all the intersections between vertical and horizontal lines: the joints."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for table in tables:\n",
" fig, ax = init_figure_and_axis(f\"Joint structure in PDF\\n{pdf_file}\")\n",
" camelot.plot(table, kind=\"joint\", ax=ax)\n"
]
},
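{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a self-contained toy illustration, the next cell computes joints as the pixel-wise intersection of a horizontal and a vertical line mask; this shows the idea, not camelot's exact code."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Self-contained toy: joints are the pixels where the horizontal and\n",
"# vertical line masks are both on (not camelot's exact code).\n",
"import cv2\n",
"import numpy as np\n",
"\n",
"horizontal = np.zeros((50, 50), dtype=np.uint8)\n",
"horizontal[25, :] = 255  # one horizontal line\n",
"vertical = np.zeros((50, 50), dtype=np.uint8)\n",
"vertical[:, 25] = 255  # one vertical line\n",
"joints = cv2.bitwise_and(horizontal, vertical)\n",
"ys, xs = np.where(joints > 0)\n",
"print(list(zip(xs.tolist(), ys.tolist())))  # -> [(25, 25)]"
]
},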
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Lattice parser - step 4: Identify rows and columns\n",
"\n",
"In the final step, the algorithm sorts all the x coordinates of the joints to identify the position of the table's columns, and the y coordinates for the table's rows. See method `_generate_columns_and_rows`.\n",
"\n",
"The resulting lattice parse for the .pdf is as follows."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"display_parse_results(tables, timer_after_parse - timer_before_parse, flavor)\n"
]
},
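{
"cell_type": "markdown",
"metadata": {},
"source": [
"The next cell sketches this step on made-up joint coordinates: sort each axis and merge near-identical values, a simplified stand-in for `merge_close_lines`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch of turning joint coordinates into column and row positions:\n",
"# sort each axis and merge near-identical values, a simplified stand-in\n",
"# for merge_close_lines.\n",
"def merge_close(values, tol=2):\n",
"    merged = []\n",
"    for v in sorted(values):\n",
"        if merged and abs(v - merged[-1]) <= tol:\n",
"            merged[-1] = (merged[-1] + v) / 2.0  # moving mean\n",
"        else:\n",
"            merged.append(v)\n",
"    return merged\n",
"\n",
"joints = [(10, 100), (10.5, 50), (60, 100), (60.2, 50)]\n",
"cols = merge_close([x for x, _ in joints])\n",
"rows = merge_close([y for _, y in joints])\n",
"print(\"cols:\", cols, \"rows:\", rows)"
]
},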
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Combining results of Network and Lattice with the Hybrid parser\n",
"\n",
"The hybrid parser aims to combine the strengths of the Network parser (identifying cells based on text alignments) and of the Lattice parser (relying on solid lines to determine tables rows and columns boundaries).\n",
"\n",
"#### Hybrid parser - step 1: Apply both parsers table bounding box detection techniques to the document\n",
"\n",
"In this step, hybrid calls both parsers, to get a) the standard table parse, b) the coordinates of the rows and columns boundaries, and c) the table boundaries (or contour).\n",
"\n",
"#### Hybrid parser - step 2: Merge the results\n",
"\n",
"If there are areas in the document where both lattice and network found a table, the hybrid parser uses the results from network, but enhances them based on the rows/columns boundaries identified by lattice in the area. Because lattice uses the solid lines detected on the document, the coordinates for b) and c) detected by Lattice are generally more precise. See the `_merge_bbox_analysis` method.\n",
"\n"
]
},
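{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a toy illustration of this merge, the next cell snaps hypothetical network column splits to nearby lattice boundaries; the function and tolerance are made up for illustration and are not camelot's API."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical illustration of the merge idea: keep the network parse,\n",
"# but snap its column splits to the nearest lattice-detected boundary\n",
"# when one is close enough. snap_columns and tol are made up, not\n",
"# camelot's API.\n",
"def snap_columns(network_cols, lattice_cols, tol=5.0):\n",
"    snapped = []\n",
"    for x in network_cols:\n",
"        closest = min(lattice_cols, key=lambda c: abs(c - x))\n",
"        snapped.append(closest if abs(closest - x) <= tol else x)\n",
"    return snapped\n",
"\n",
"print(snap_columns([71, 210, 330], [72, 209, 327, 442]))\n",
"# -> [72, 209, 327]: each split moved to the more precise lattice line"
]
},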
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"flavor = \"hybrid\"\n",
"timer_before_parse = time.perf_counter()\n",
"tables = camelot.read_pdf(filename, flavor=flavor, debug=True, **kwargs)\n",
"timer_after_parse = time.perf_counter()\n",
"\n",
"display_parse_results(tables, timer_after_parse - timer_before_parse, flavor)"
]
}
],
"metadata": {
"language_info": {
"name": "python",
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"version": "3.7.7-final"
},
"orig_nbformat": 2,
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"npconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": 3,
"kernelspec": {
"name": "python37764bit8418972e58f441528b05b4b21a1f095d",
"display_name": "Python 3.7.7 64-bit"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

@@ -0,0 +1,201 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Parser comparison\n",
"\n",
"This notebook lets you visualize side-by-side how each parser analyzes a document, and compare the resulting tables.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Bootstrap and common imports\n",
"import os, sys, time\n",
"sys.path.insert(0, os.path.abspath('')) # Prefer the local version of camelot if available\n",
"import camelot\n",
"\n",
"print(f\"Using Camelot v{camelot.__version__} from file {camelot.__file__}.\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Select a PDF file to review\n",
"\n",
"This is seeded with the unit test files for convenience."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"kwargs = {}\n",
"data = None\n",
"# pdf_file, kwargs, data = \"superscript.pdf\", {\"flag_size\": True}, data_stream_flag_size # test_hybrid_flag_size\n",
"# pdf_file = \"health.pdf\" # test_hybrid\n",
"# pdf_file = \"clockwise_table_2.pdf\"\n",
"\n",
"# pdf_file = \"tabula/12s0324.pdf\" # interesting because contains two separate tables\n",
"\n",
"# pdf_file = \"clockwise_table_2.pdf\" # test_hybrid_table_rotated / test_stream_table_rotated\n",
"# pdf_file, kwargs = \"tabula/us-007.pdf\", {\"table_regions\": [\"320,335,573,505\"]} # test_hybrid_table_regions\n",
"# pdf_file, kwargs = \"detect_vertical_false.pdf\", {\"strip_text\": \" ,\\n\"} # data_stream_strip_text\n",
"# pdf_file, kwargs, data = \"tabula/m27.pdf\", {\"columns\": [\"72,95,209,327,442,529,566,606,683\"], \"split_text\": True, }, data_stream_split_text # data_stream_split_text\n",
"pdf_file = \"vertical_header.pdf\"\n",
"\n",
"# pdf_file, kwargs = \"vertical_header.pdf\", {\"pages\": \"2\"}\n",
"\n",
"# pdf_file, kwargs = \"PIR_Prospetto.dOfferta.pdf\", {\"pages\": \"6\"}\n",
"# pdf_file = \"twotables_2.pdf\" # Lattice is better\n",
"# pdf_file = \"camelot-issue-132-multiple-tables.pdf\"\n",
"# pdf_file, kwargs, data = \"edge_tol.pdf\", {\"edge_tol\": 500}, data_stream_edge_tol\n",
"# pdf_file, kwargs, data = \"edge_tol.pdf\", {}, data_stream_edge_tol\n",
"# pdf_file, kwargs = \"tabula/icdar2013-dataset/competition-dataset-us/us-030.pdf\", {\"pages\": \"2\"} # test_lattice\n",
"# pdf_file, kwargs = \"background_lines_1.pdf\", {\"process_background\": True} # test_lattice_process_background\n",
"\n",
"filename = os.path.join(\n",
" os.path.dirname(os.path.abspath('.')),\n",
" \"camelot/tests/files\",\n",
" pdf_file\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"FLAVORS = [\"stream\", \"lattice\", \"network\", \"hybrid\"]\n",
"tables_parsed = {}\n",
"parses = {}\n",
"max_tables = 0\n",
"for idx, flavor in enumerate(FLAVORS):\n",
" timer_before_parse = time.perf_counter()\n",
" error, tables = None, []\n",
" try:\n",
" tables = camelot.read_pdf(filename, flavor=flavor, debug=True, **kwargs)\n",
" except ValueError as value_error:\n",
" error = f\"Invalid argument for parser {flavor}: {value_error}\"\n",
" print(error)\n",
" timer_after_parse = time.perf_counter()\n",
" max_tables = max(max_tables, len(tables))\n",
"\n",
" parses[flavor] = {\n",
" \"tables\": tables,\n",
" \"time\": timer_after_parse - timer_before_parse,\n",
" \"error\": error\n",
" }\n",
"\n",
" print(f\"##### {flavor} ####\")\n",
" print(f\"Found {len(tables)} table(s):\")\n",
" for idx, table in enumerate(tables):\n",
" flavors_matching = []\n",
" for previous_flavor, previous_tables in tables_parsed.items():\n",
" for prev_idx, previous_table in enumerate(previous_tables):\n",
" if previous_table.df.equals(table.df):\n",
" flavors_matching.append(\n",
" f\"{previous_flavor} table {prev_idx}\")\n",
" print(f\"## Table {idx} ##\")\n",
" if flavors_matching:\n",
" print(f\"Same as {', '.join(flavors_matching)}.\")\n",
" else:\n",
" display(table.df)\n",
" print(\"\")\n",
" tables_parsed[flavor] = tables\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Show tables layout within original document"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"\n",
"# Set up plotting options\n",
"import matplotlib.pyplot as plt\n",
"%matplotlib inline\n",
"PLOT_HEIGHT = 12\n",
"\n",
"row_count = max(max_tables, 1)\n",
"plt.rcParams[\"figure.figsize\"] = [PLOT_HEIGHT * len(FLAVORS), PLOT_HEIGHT * row_count]\n",
"fig, axes = plt.subplots(row_count, len(FLAVORS))\n",
"plt.subplots_adjust(wspace=0, hspace=0) # Reduce margins to maximize the display zone\n",
"\n",
"fig.suptitle('Side-by-side flavor comparison', fontsize=24, fontweight='bold')\n",
"for idx, flavor in enumerate(FLAVORS):\n",
" parse = parses[flavor]\n",
" tables = parse[\"tables\"]\n",
" top_ax = axes.flat[idx]\n",
" title = f\"{flavor}\\n\" \\\n",
" f\"Detected {len(tables)} table(s) in {parse['time']:.2f}s\"\n",
" if parse['error']:\n",
" title = title + f\"\\nError parsing: {parse['error']}\"\n",
" top_ax.set_title(title, fontsize=12, fontweight='bold')\n",
" for table_idx, table in enumerate(tables):\n",
" if max_tables > 1:\n",
" ax = axes[table_idx][idx]\n",
" else:\n",
" ax = axes[idx]\n",
" fig = camelot.plot(table, kind='grid', ax=ax)\n",
" ax.text(\n",
" 0.5,-0.1, \n",
" \"{flavor} table {table_idx} - {rows}x{cols}\".format(\n",
" flavor=flavor,\n",
" table_idx=table_idx,\n",
" rows=table.shape[0],\n",
" cols=table.shape[1],\n",
" ), \n",
" size=14, ha=\"center\", \n",
" transform=ax.transAxes\n",
" )\n",
" timer_after_plot = time.perf_counter()"
]
}
],
"metadata": {
"language_info": {
"name": "python",
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"version": "3.7.7-final"
},
"orig_nbformat": 2,
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"npconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": 3,
"kernelspec": {
"name": "python37764bit8418972e58f441528b05b4b21a1f095d",
"display_name": "Python 3.7.7 64-bit"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

@@ -5,6 +5,6 @@ numpy>=1.13.3
opencv-python>=3.4.2.17
openpyxl>=2.5.8
pandas>=0.23.4
pdfminer.six>=20200402
PyPDF2>=1.26.0
Sphinx>=1.7.9

@@ -3,4 +3,6 @@ test=pytest
[tool:pytest]
addopts = --verbose --cov-config .coveragerc --cov-report term --cov-report xml --cov=camelot --mpl
# Switch to no-cov if you want to debug a test with breakpoints.
# addopts = --verbose --mpl
python_files = tests/test_*.py

@@ -19,7 +19,7 @@ requires = [
    'numpy>=1.13.3',
    'openpyxl>=2.5.8',
    'pandas>=0.23.4',
    'pdfminer.six>=20200402',
    'PyPDF2>=1.26.0'
]
@@ -32,12 +32,12 @@ plot_requires = [
]
dev_requires = [
    'codecov>=2.1.3',
    'pytest>=4.6',
    'pytest-cov>=2.10.0',
    'pytest-mpl>=0.11',
    'pytest-runner>=5.2',
    'Sphinx>=3.0.3'
]
all_requires = cv_requires + plot_requires
@@ -69,7 +69,7 @@ def setup_package():
        },
        classifiers=[
            # Trove classifiers
            # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers  # noqa
            'License :: OSI Approved :: MIT License',
            'Programming Language :: Python :: 3.6',
            'Programming Language :: Python :: 3.7',

@ -19,10 +19,16 @@ def test_help_output():
output = result.output output = result.output
assert prog_name == "camelot" assert prog_name == "camelot"
assert result.output.startswith("Usage: %(prog_name)s [OPTIONS] COMMAND" % locals()) assert result.output.startswith(
"Usage: %(prog_name)s [OPTIONS] COMMAND" %
locals()
)
assert all( assert all(
v in result.output v in result.output
for v in ["Options:", "--version", "--help", "Commands:", "lattice", "stream"] for v in [
"Options:", "--version", "--help", "Commands:", "lattice",
"stream"
]
) )
@ -66,6 +72,26 @@ def test_cli_stream():
assert format_error in result.output assert format_error in result.output
def test_cli_network():
with TemporaryDirectory() as tempdir:
infile = os.path.join(testdir, "budget.pdf")
outfile = os.path.join(tempdir, "budget.csv")
runner = CliRunner()
result = runner.invoke(
cli, ["--format", "csv", "--output", outfile, "network", infile]
)
assert result.exit_code == 0
assert result.output == "Found 1 tables\n"
result = runner.invoke(cli, ["--format", "csv", "network", infile])
output_error = "Error: Please specify output file path using --output"
assert output_error in result.output
result = runner.invoke(cli, ["--output", outfile, "network", infile])
format_error = "Please specify output file format using --format"
assert format_error in result.output
def test_cli_password(): def test_cli_password():
with TemporaryDirectory() as tempdir: with TemporaryDirectory() as tempdir:
infile = os.path.join(testdir, "health_protected.pdf") infile = os.path.join(testdir, "health_protected.pdf")
@ -121,7 +147,8 @@ def test_cli_output_format():
outfile = os.path.join(tempdir, "health.json") outfile = os.path.join(tempdir, "health.json")
result = runner.invoke( result = runner.invoke(
cli, cli,
["--format", "json", "--output", outfile, "stream", infile], ["--format", "json", "--output", outfile.format("json"), "stream",
infile],
) )
assert result.exit_code == 0 assert result.exit_code == 0
@@ -129,7 +156,8 @@ def test_cli_output_format():
         outfile = os.path.join(tempdir, "health.xlsx")
         result = runner.invoke(
             cli,
-            ["--format", "excel", "--output", outfile, "stream", infile],
+            ["--format", "excel", "--output", outfile.format("xlsx"), "stream",
+             infile],
        )
         assert result.exit_code == 0
@@ -137,7 +165,8 @@ def test_cli_output_format():
         outfile = os.path.join(tempdir, "health.html")
         result = runner.invoke(
             cli,
-            ["--format", "html", "--output", outfile, "stream", infile],
+            ["--format", "html", "--output", outfile.format("html"), "stream",
+             infile],
         )
         assert result.exit_code == 0
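Reviewer note on the three hunks above: outfile is built from a literal filename ("health.json", "health.xlsx", "health.html") that contains no "{}" replacement field, so the added .format(...) calls are no-ops on those strings. A quick REPL check:

    >>> "health.json".format("json")
    'health.json'

The tests still pass because the path is unchanged, but the .format(...) calls could be dropped.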
@@ -170,6 +199,10 @@ def test_cli_quiet():
         assert "No tables found on page-1" in result.output
         result = runner.invoke(
-            cli, ["--quiet", "--format", "csv", "--output", outfile, "stream", infile]
+            cli,
+            [
+                "--quiet", "--format", "csv", "--output", outfile, "stream",
+                infile
+            ]
         )
         assert "No tables found on page-1" not in result.output

View File

@@ -8,15 +8,20 @@ from pandas.testing import assert_frame_equal

 import camelot
 from camelot.core import Table, TableList
 from camelot.__version__ import generate_version
+# compare_tables used in console mode while debugging
+from camelot.utils import compare_tables  # noqa

 from .data import *

 testdir = os.path.dirname(os.path.abspath(__file__))
 testdir = os.path.join(testdir, "files")


 def test_parsing_report():
-    parsing_report = {"accuracy": 99.02, "whitespace": 12.24, "order": 1, "page": 1}
+    parsing_report = {
+        "accuracy": 99.02, "whitespace": 12.24, "order": 1, "page": 1
+    }

     filename = os.path.join(testdir, "foo.pdf")
     tables = camelot.read_pdf(filename)
@@ -28,9 +33,11 @@ def test_password():
     filename = os.path.join(testdir, "health_protected.pdf")
     tables = camelot.read_pdf(filename, password="ownerpass", flavor="stream")
+    assert len(tables) == 1
     assert_frame_equal(df, tables[0].df)

     tables = camelot.read_pdf(filename, password="userpass", flavor="stream")
+    assert len(tables) == 1
     assert_frame_equal(df, tables[0].df)
@@ -143,6 +150,194 @@ def test_stream_layout_kwargs():
     assert_frame_equal(df, tables[0].df)


+def test_network():
+    df = pd.DataFrame(data_stream)
+    filename = os.path.join(testdir, "health.pdf")
+    tables = camelot.read_pdf(filename, flavor="network")
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_network_table_rotated():
+    df = pd.DataFrame(data_network_table_rotated)
+    filename = os.path.join(testdir, "clockwise_table_2.pdf")
+    tables = camelot.read_pdf(filename, flavor="network")
+    assert_frame_equal(df, tables[0].df)
+
+    filename = os.path.join(testdir, "anticlockwise_table_2.pdf")
+    tables = camelot.read_pdf(filename, flavor="network")
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_network_two_tables_a():
+    df1 = pd.DataFrame(data_network_two_tables_1)
+    df2 = pd.DataFrame(data_network_two_tables_2)
+    filename = os.path.join(testdir, "tabula/12s0324.pdf")
+    tables = camelot.read_pdf(filename, flavor="network")
+
+    assert len(tables) == 2
+    assert df1.equals(tables[0].df)
+    assert df2.equals(tables[1].df)
+
+
+# Reported as https://github.com/camelot-dev/camelot/issues/132
+def test_network_two_tables_b():
+    df1 = pd.DataFrame(data_network_two_tables_b_1)
+    df2 = pd.DataFrame(data_network_two_tables_b_2)
+    filename = os.path.join(testdir, "camelot-issue-132-multiple-tables.pdf")
+    tables = camelot.read_pdf(filename, flavor="network")
+
+    assert len(tables) == 2
+    assert df1.equals(tables[0].df)
+    assert df2.equals(tables[1].df)
+
+
+def test_network_vertical_header():
+    """Tests a complex table with a vertical text header.
+    """
+    df = pd.DataFrame(data_network_vertical_headers)
+    filename = os.path.join(testdir, "vertical_header.pdf")
+    tables = camelot.read_pdf(filename, flavor="network")
+    assert len(tables) == 1
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_network_table_regions():
+    df = pd.DataFrame(data_network_table_regions)
+    filename = os.path.join(testdir, "tabula/us-007.pdf")
+    # The "stream" test looks for a region in ["320,460,573,335"], which
+    # should exclude the header.
+    tables = camelot.read_pdf(
+        filename, flavor="network", table_regions=["320,335,573,505"]
+    )
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_network_table_areas():
+    df = pd.DataFrame(data_stream_table_areas)
+    filename = os.path.join(testdir, "tabula/us-007.pdf")
+    tables = camelot.read_pdf(
+        filename, flavor="network", table_areas=["320,500,573,335"]
+    )
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_network_columns():
+    df = pd.DataFrame(data_stream_columns)
+    filename = os.path.join(testdir, "mexican_towns.pdf")
+    tables = camelot.read_pdf(
+        filename, flavor="network", columns=["67,180,230,425,475"], row_tol=10
+    )
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_network_split_text():
+    df = pd.DataFrame(data_network_split_text)
+    filename = os.path.join(testdir, "tabula/m27.pdf")
+    tables = camelot.read_pdf(
+        filename,
+        flavor="network",
+        columns=["72,95,209,327,442,529,566,606,683"],
+        split_text=True,
+    )
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_network_flag_size():
+    df = pd.DataFrame(data_network_flag_size)
+    filename = os.path.join(testdir, "superscript.pdf")
+    tables = camelot.read_pdf(filename, flavor="network", flag_size=True)
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_network_strip_text():
+    df = pd.DataFrame(data_network_strip_text)
+    filename = os.path.join(testdir, "detect_vertical_false.pdf")
+    tables = camelot.read_pdf(filename, flavor="network", strip_text=" ,\n")
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_network_edge_tol():
+    df = pd.DataFrame(data_network_edge_tol)
+    filename = os.path.join(testdir, "edge_tol.pdf")
+    tables = camelot.read_pdf(filename, flavor="network", edge_tol=500)
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_network_layout_kwargs():
+    df = pd.DataFrame(data_stream_layout_kwargs)
+    filename = os.path.join(testdir, "detect_vertical_false.pdf")
+    tables = camelot.read_pdf(
+        filename, flavor="network", layout_kwargs={"detect_vertical": False}
+    )
+    assert_frame_equal(df, tables[0].df)
+
+
+# Hybrid parser
+def test_hybrid():
+    df = pd.DataFrame(data_hybrid)
+    filename = os.path.join(testdir, "health.pdf")
+    tables = camelot.read_pdf(filename, flavor="hybrid")
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_hybrid_two_tables():
+    df1 = pd.DataFrame(data_network_two_tables_1)
+    df2 = pd.DataFrame(data_network_two_tables_2)
+    filename = os.path.join(testdir, "tabula/12s0324.pdf")
+    tables = camelot.read_pdf(filename, flavor="hybrid")
+
+    assert len(tables) == 2
+    assert df1.equals(tables[0].df)
+    assert df2.equals(tables[1].df)
+
+
+def test_hybrid_vertical_header():
+    """Tests a complex table with a vertical text header.
+    """
+    df = pd.DataFrame(data_hybrid_vertical_headers)
+    filename = os.path.join(testdir, "vertical_header.pdf")
+    tables = camelot.read_pdf(filename, flavor="hybrid")
+    assert len(tables) == 1
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_hybrid_process_background():
+    df = pd.DataFrame(data_hybrid_process_background)
+    filename = os.path.join(testdir, "background_lines_1.pdf")
+    tables = camelot.read_pdf(
+        filename, flavor="hybrid", process_background=True)
+    assert_frame_equal(df, tables[1].df)
+
+
+def test_hybrid_split_text():
+    df = pd.DataFrame(data_network_split_text)
+    filename = os.path.join(testdir, "tabula/m27.pdf")
+    tables = camelot.read_pdf(
+        filename,
+        flavor="hybrid",
+        columns=["72,95,209,327,442,529,566,606,683"],
+        split_text=True,
+    )
+    assert_frame_equal(df, tables[0].df)
+
+
+# Lattice parser tests
 def test_lattice():
     df = pd.DataFrame(data_lattice)
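Taken together, the new tests pin down the surface of the two added flavors: "network" accepts the same keyword arguments as "stream" (table_regions, table_areas, columns, split_text, flag_size, strip_text, edge_tol, row_tol, layout_kwargs), and "hybrid" additionally honors the lattice-style process_background option. A minimal sketch of the added API surface, using the health.pdf fixture from the tests above:

    import camelot

    # the new flavors are selected exactly like the existing ones
    network_tables = camelot.read_pdf("health.pdf", flavor="network")
    hybrid_tables = camelot.read_pdf("health.pdf", flavor="hybrid")
    print(network_tables[0].parsing_report)
    print(hybrid_tables[0].parsing_report)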
@@ -229,9 +424,9 @@ def test_repr():
     tables = camelot.read_pdf(filename)
     assert repr(tables) == "<TableList n=1>"
     assert repr(tables[0]) == "<Table shape=(7, 7)>"
-    assert (
-        repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
-    )
+    assert \
+        repr(tables[0].cells[0][0]) == \
+        "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"


 def test_pages():
@@ -239,22 +434,23 @@ def test_pages():
     tables = camelot.read_pdf(url)
     assert repr(tables) == "<TableList n=1>"
     assert repr(tables[0]) == "<Table shape=(7, 7)>"
-    assert (
-        repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
-    )
+    assert \
+        repr(tables[0].cells[0][0]) == \
+        "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"

     tables = camelot.read_pdf(url, pages="1-end")
     assert repr(tables) == "<TableList n=1>"
     assert repr(tables[0]) == "<Table shape=(7, 7)>"
-    assert (
-        repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
-    )
+    assert \
+        repr(tables[0].cells[0][0]) == \
+        "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"

     tables = camelot.read_pdf(url, pages="all")
     assert repr(tables) == "<TableList n=1>"
     assert repr(tables[0]) == "<Table shape=(7, 7)>"
     assert (
-        repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
+        repr(tables[0].cells[0][0]) ==
+        "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
     )
@@ -264,7 +460,8 @@ def test_url():
     assert repr(tables) == "<TableList n=1>"
     assert repr(tables[0]) == "<Table shape=(7, 7)>"
     assert (
-        repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
+        repr(tables[0].cells[0][0]) ==
+        "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
     )
@@ -284,7 +481,12 @@ def test_table_order():
         return t

     table_list = TableList(
-        [_make_table(2, 1), _make_table(1, 1), _make_table(3, 4), _make_table(1, 2)]
+        [
+            _make_table(2, 1),
+            _make_table(1, 1),
+            _make_table(3, 4),
+            _make_table(1, 2)
+        ]
     )

     assert [(t.page, t.order) for t in sorted(table_list)] == [

View File

@@ -14,32 +14,33 @@ filename = os.path.join(testdir, "foo.pdf")

 def test_unknown_flavor():
-    message = "Unknown flavor specified." " Use either 'lattice' or 'stream'"
+    message = ("Unknown flavor specified."
+               " Use either 'lattice', 'stream', or 'network'")
     with pytest.raises(NotImplementedError, match=message):
-        tables = camelot.read_pdf(filename, flavor="chocolate")
+        camelot.read_pdf(filename, flavor='chocolate')


 def test_input_kwargs():
     message = "columns cannot be used with flavor='lattice'"
     with pytest.raises(ValueError, match=message):
-        tables = camelot.read_pdf(filename, columns=["10,20,30,40"])
+        camelot.read_pdf(filename, columns=['10,20,30,40'])


 def test_unsupported_format():
     message = "File format not supported"
     filename = os.path.join(testdir, "foo.csv")
     with pytest.raises(NotImplementedError, match=message):
-        tables = camelot.read_pdf(filename)
+        camelot.read_pdf(filename)


 def test_stream_equal_length():
     message = "Length of table_areas and columns" " should be equal"
     with pytest.raises(ValueError, match=message):
-        tables = camelot.read_pdf(
+        camelot.read_pdf(
             filename,
-            flavor="stream",
-            table_areas=["10,20,30,40"],
-            columns=["10,20,30,40", "10,20,30,40"],
+            flavor='stream',
+            table_areas=['10,20,30,40'],
+            columns=['10,20,30,40', '10,20,30,40']
         )
@@ -48,11 +49,9 @@ def test_image_warning():
     with warnings.catch_warnings():
         warnings.simplefilter("error")
         with pytest.raises(UserWarning) as e:
-            tables = camelot.read_pdf(filename)
-        assert (
-            str(e.value)
-            == "page-1 is image-based, camelot only works on text-based pages."
-        )
+            camelot.read_pdf(filename)
+        assert str(e.value) == 'page-1 is image-based, camelot only works ' \
+            'on text-based pages.'


 def test_no_tables_found():
@@ -60,8 +59,8 @@ def test_no_tables_found():
     with warnings.catch_warnings():
         warnings.simplefilter("error")
         with pytest.raises(UserWarning) as e:
-            tables = camelot.read_pdf(filename)
-        assert str(e.value) == "No tables found on page-1"
+            camelot.read_pdf(filename)
+        assert str(e.value) == 'No tables found on page-1'


 def test_no_tables_found_logs_suppressed():
@@ -70,7 +69,7 @@ def test_no_tables_found_logs_suppressed():
         # the test should fail if any warning is thrown
         warnings.simplefilter("error")
         try:
-            tables = camelot.read_pdf(filename, suppress_stdout=True)
+            camelot.read_pdf(filename, suppress_stdout=True)
         except Warning as e:
             warning_text = str(e)
             pytest.fail(f"Unexpected warning: {warning_text}")
@@ -82,7 +81,7 @@ def test_no_tables_found_warnings_suppressed():
         # the test should fail if any warning is thrown
         warnings.simplefilter("error")
         try:
-            tables = camelot.read_pdf(filename, suppress_stdout=True)
+            camelot.read_pdf(filename, suppress_stdout=True)
         except Warning as e:
             warning_text = str(e)
             pytest.fail(f"Unexpected warning: {warning_text}")
@@ -92,11 +91,11 @@ def test_no_password():
     filename = os.path.join(testdir, "health_protected.pdf")
     message = "file has not been decrypted"
     with pytest.raises(Exception, match=message):
-        tables = camelot.read_pdf(filename)
+        camelot.read_pdf(filename)


 def test_bad_password():
     filename = os.path.join(testdir, "health_protected.pdf")
     message = "file has not been decrypted"
     with pytest.raises(Exception, match=message):
-        tables = camelot.read_pdf(filename, password="wrongpass")
+        camelot.read_pdf(filename, password='wrongpass')
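Reviewer note: the updated flavor message lists 'lattice', 'stream', or 'network', but not the 'hybrid' flavor exercised in tests/test_common.py. If read_pdf accepts "hybrid", the message (and this test) may be worth extending; a hypothetical assertion, not part of this diff, that would catch the gap:

    with pytest.raises(NotImplementedError) as e:
        camelot.read_pdf(filename, flavor="chocolate")
    assert "hybrid" in str(e.value)  # fails against the current message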

View File

@@ -3,58 +3,144 @@

 import os

 import pytest
+import matplotlib

 import camelot

+# The version of matplotlib has an impact on some of the tests. Unfortunately,
+# we can't enforce usage of a recent version of matplotlib without dropping
+# support for Python 3.6.
+# To check the version of matplotlib installed:
+#   pip freeze | grep matplotlib
+# To force upgrade:
+#   pip install --upgrade --force-reinstall matplotlib
+# To force usage of a Python 3.6 compatible version:
+#   pip install "matplotlib==3.0.3"
+# This condition can be removed in favor of a version requirement bump for
+# matplotlib once support for Python 3.5 is dropped.
+LEGACY_MATPLOTLIB = matplotlib.__version__ < "3.2.1"
+
+# Bump the default plot tolerance from 2 to account for cross-platform testing
+# via Travis, and resulting minor font changes.
+TOLERANCE = 4

 testdir = os.path.dirname(os.path.abspath(__file__))
 testdir = os.path.join(testdir, "files")


-@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True)
+def unit_test_stable_plot(table, kind):
+    if not LEGACY_MATPLOTLIB:
+        # See https://matplotlib.org/3.2.1/users/whats_new.html#kerning-adjustments-now-use-correct-values  # noqa
+        matplotlib.rcParams["text.kerning_factor"] = 6
+    return camelot.plot(table, kind=kind)
+
+
+@pytest.mark.mpl_image_compare(
+    baseline_dir="files/baseline_plots", remove_text=True, tolerance=TOLERANCE)
 def test_text_plot():
     filename = os.path.join(testdir, "foo.pdf")
     tables = camelot.read_pdf(filename)
-    return camelot.plot(tables[0], kind="text")
+    return unit_test_stable_plot(tables[0], 'text')


-@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True)
+@pytest.mark.mpl_image_compare(
+    baseline_dir="files/baseline_plots", remove_text=True, tolerance=TOLERANCE)
 def test_grid_plot():
     filename = os.path.join(testdir, "foo.pdf")
     tables = camelot.read_pdf(filename)
-    return camelot.plot(tables[0], kind="grid")
+    return unit_test_stable_plot(tables[0], 'grid')


+@pytest.mark.mpl_image_compare(
+    baseline_dir="files/baseline_plots", remove_text=True, tolerance=TOLERANCE)
+def test_stream_grid_plot():
+    filename = os.path.join(testdir, "foo.pdf")
+    tables = camelot.read_pdf(filename, flavor="stream")
+    return unit_test_stable_plot(tables[0], 'grid')
+
+
+@pytest.mark.mpl_image_compare(
+    baseline_dir="files/baseline_plots", remove_text=True, tolerance=TOLERANCE)
+def test_network_grid_plot():
+    filename = os.path.join(testdir, "foo.pdf")
+    tables = camelot.read_pdf(filename, flavor="network")
+    return unit_test_stable_plot(tables[0], 'grid')
+
+
-@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True)
+@pytest.mark.mpl_image_compare(
+    baseline_dir="files/baseline_plots", remove_text=True, tolerance=TOLERANCE)
 def test_lattice_contour_plot():
     filename = os.path.join(testdir, "foo.pdf")
     tables = camelot.read_pdf(filename)
-    return camelot.plot(tables[0], kind="contour")
+    return unit_test_stable_plot(tables[0], 'contour')


-@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True)
+@pytest.mark.mpl_image_compare(
+    baseline_dir="files/baseline_plots", remove_text=True, tolerance=TOLERANCE)
 def test_stream_contour_plot():
     filename = os.path.join(testdir, "tabula/12s0324.pdf")
-    tables = camelot.read_pdf(filename, flavor="stream")
-    return camelot.plot(tables[0], kind="contour")
+    tables = camelot.read_pdf(filename, flavor='stream')
+    return unit_test_stable_plot(tables[0], 'contour')


+@pytest.mark.mpl_image_compare(
+    baseline_dir="files/baseline_plots", remove_text=True, tolerance=TOLERANCE)
+def test_network_contour_plot():
+    filename = os.path.join(testdir, "tabula/12s0324.pdf")
+    tables = camelot.read_pdf(filename, flavor='network')
+    return unit_test_stable_plot(tables[0], 'contour')
+
+
-@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True)
+@pytest.mark.mpl_image_compare(
+    baseline_dir="files/baseline_plots", remove_text=True, tolerance=TOLERANCE)
 def test_line_plot():
     filename = os.path.join(testdir, "foo.pdf")
     tables = camelot.read_pdf(filename)
-    return camelot.plot(tables[0], kind="line")
+    return unit_test_stable_plot(tables[0], 'line')


-@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True)
+@pytest.mark.mpl_image_compare(
+    baseline_dir="files/baseline_plots", remove_text=True, tolerance=TOLERANCE)
 def test_joint_plot():
     filename = os.path.join(testdir, "foo.pdf")
     tables = camelot.read_pdf(filename)
-    return camelot.plot(tables[0], kind="joint")
+    return unit_test_stable_plot(tables[0], 'joint')


-@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True)
-def test_textedge_plot():
+@pytest.mark.mpl_image_compare(
+    baseline_dir="files/baseline_plots", remove_text=True, tolerance=TOLERANCE)
+def test_stream_textedge_plot():
     filename = os.path.join(testdir, "tabula/12s0324.pdf")
-    tables = camelot.read_pdf(filename, flavor="stream")
-    return camelot.plot(tables[0], kind="textedge")
+    tables = camelot.read_pdf(filename, flavor='stream')
+    return unit_test_stable_plot(tables[0], 'textedge')
+
+
+@pytest.mark.mpl_image_compare(
+    baseline_dir="files/baseline_plots", remove_text=True, tolerance=TOLERANCE)
+def test_network_textedge_plot():
+    filename = os.path.join(testdir, "tabula/12s0324.pdf")
+    tables = camelot.read_pdf(filename, debug=True, flavor='network')
+    return unit_test_stable_plot(tables[0], 'textedge')
+
+
+@pytest.mark.mpl_image_compare(
+    baseline_dir="files/baseline_plots", remove_text=True, tolerance=TOLERANCE)
+def test_network_table_regions_textedge_plot():
+    filename = os.path.join(testdir, "tabula/us-007.pdf")
+    tables = camelot.read_pdf(
+        filename, debug=True, flavor="network",
+        table_regions=["320,505,573,330"]
+    )
+    return unit_test_stable_plot(tables[0], 'textedge')
+
+
+@pytest.mark.mpl_image_compare(
+    baseline_dir="files/baseline_plots", remove_text=True, tolerance=TOLERANCE)
+def test_network_table_areas_text_plot():
+    filename = os.path.join(testdir, "tabula/us-007.pdf")
+    tables = camelot.read_pdf(
+        filename, debug=True, flavor="network",
+        table_areas=["320,500,573,335"]
+    )
+    return unit_test_stable_plot(tables[0], 'text')
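These plot tests are driven by the pytest-mpl plugin (the mpl_image_compare marker). For reference, typical invocations (the flags are pytest-mpl's own, not introduced by this diff):

    # compare generated figures against tests/files/baseline_plots
    pytest --mpl tests/test_plotting.py
    # regenerate baseline images after an intentional rendering change
    pytest --mpl-generate-path=tests/files/baseline_plots tests/test_plotting.py

The raised TOLERANCE and the text.kerning_factor pin both exist to keep the comparisons stable across matplotlib versions and CI font differences.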