pull/153/head
Frh 2020-04-25 22:47:23 -07:00
parent f7aafcd05c
commit e1572a10c9
8 changed files with 141 additions and 124 deletions

View File

@ -18,7 +18,7 @@ logger = logging.getLogger("camelot")
logger.setLevel(logging.INFO)
class Config(object):
class Config():
def __init__(self):
self.config = {}

View File

@ -31,7 +31,7 @@ VERTICAL_ALIGNMENTS = ["top", "bottom", "center"]
ALL_ALIGNMENTS = HORIZONTAL_ALIGNMENTS + VERTICAL_ALIGNMENTS
class TextAlignment(object):
class TextAlignment():
"""Represents a list of textlines sharing an alignment on a coordinate.
The alignment can be left/right/middle or top/bottom/center.
@ -137,7 +137,7 @@ class TextEdge(TextAlignment):
self.is_valid = True
class TextAlignments(object):
class TextAlignments():
"""Defines a dict of text edges across reference alignments.
"""
@ -327,7 +327,7 @@ class TextEdges(TextAlignments):
return table_areas_padded
class Cell(object):
class Cell():
"""Defines a cell in a table with coordinates relative to a
left-bottom origin. (PDF coordinate space)
@ -409,7 +409,7 @@ class Cell(object):
return self.top + self.bottom + self.left + self.right
class Table(object):
class Table():
"""Defines a table with coordinates relative to a left-bottom
origin. (PDF coordinate space)
@ -815,7 +815,7 @@ class Table(object):
return self
class TableList(object):
class TableList():
"""Defines a list of camelot.core.Table objects. Each table can
be accessed using its index.

View File

@ -19,7 +19,7 @@ from ..utils import (
from ..core import Table
class BaseParser(object):
class BaseParser():
"""Defines a base parser.
"""
def __init__(
@ -32,11 +32,11 @@ class BaseParser(object):
strip_text="",
shift_text=None,
flag_size=False,
debug=False
):
debug=False):
self.id = parser_id
self.table_regions = table_regions
self.table_areas = table_areas
self.table_bbox = {}
self.copy_text = copy_text
self.split_text = split_text
@ -49,7 +49,9 @@ class BaseParser(object):
self.t_bbox = None
# For plotting details of parsing algorithms
self.parse_details = {} if debug else None
self.parse_details = {}
if not debug:
self.parse_details = None
def prepare_page_parse(self, filename, layout, dimensions,
page_idx, layout_kwargs):
@ -177,6 +179,18 @@ class BaseParser(object):
table.cells[r_idx][c_idx].text = text
return pos_errors
def _generate_columns_and_rows(self, bbox, table_idx):
# Pure virtual, must be defined by the derived parser
raise NotImplementedError()
def _generate_table(self, table_idx, cols, rows, **kwargs):
# Pure virtual, must be defined by the derived parser
raise NotImplementedError()
def _generate_table_bbox(self):
# Pure virtual, must be defined by the derived parser
raise NotImplementedError()
def extract_tables(self):
if self._document_has_no_text():
return []
@ -188,7 +202,11 @@ class BaseParser(object):
_tables = []
# sort tables based on y-coord
for table_idx, bbox in enumerate(
sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True)
sorted(
self.table_bbox.keys(),
key=lambda x: x[1],
reverse=True
)
):
cols, rows, v_s, h_s = self._generate_columns_and_rows(
bbox,
@ -244,8 +262,7 @@ class TextBaseParser(BaseParser):
row_tol=2,
column_tol=0,
debug=False,
**kwargs
):
**kwargs):
super().__init__(
parser_id,
table_regions=table_regions,

View File

@ -3,9 +3,9 @@
from __future__ import division
import numpy as np
import copy
import math
import numpy as np
from .base import TextBaseParser
from ..core import (
@ -16,6 +16,7 @@ from ..core import (
)
from ..utils import (
bbox_from_str,
expand_bbox_with_textline,
text_in_bbox,
bbox_from_textlines,
distance_tl_to_bbox,
@ -25,6 +26,23 @@ from ..utils import (
# maximum number of columns over which a header can spread
MAX_COL_SPREAD_IN_HEADER = 3
# Minimum number of textlines in a table
MINIMUM_TEXTLINES_IN_TABLE = 6
def column_spread(left, right, col_anchors):
    """Count the column anchors crossed by the segment [left, right].

    The anchors are scanned in the order given: leading anchors strictly
    smaller than ``left`` are skipped, then the consecutive anchors
    strictly smaller than ``right`` are counted.

    Parameters
    ----------
    left : float
        Lower bound of the segment.
    right : float
        Upper bound of the segment.
    col_anchors : sequence of float
        Coordinates of the column anchors.

    Returns
    -------
    int
        Number of anchors crossed by the segment.

    """
    total = len(col_anchors)
    # Position of the first anchor that is not strictly before `left`.
    start = next(
        (
            idx
            for idx, anchor in enumerate(col_anchors)
            if not anchor < left
        ),
        total,
    )
    # Resume from there, up to the first anchor not strictly before `right`.
    stop = next(
        (idx for idx in range(start, total) if not col_anchors[idx] < right),
        total,
    )
    return stop - start
def search_header_from_body_bbox(body_bbox, textlines, col_anchors, max_v_gap):
"""Expand a bbox vertically up by looking for plausible headers.
@ -40,19 +58,6 @@ def search_header_from_body_bbox(body_bbox, textlines, col_anchors, max_v_gap):
(left, bottom, right, top) = body_bbox
zones = []
def column_spread(left, right, col_anchors):
"""Get the number of columns crossed by a segment [left, right]."""
indexLeft = 0
while indexLeft < len(col_anchors) \
and col_anchors[indexLeft] < left:
indexLeft += 1
indexRight = indexLeft
while indexRight < len(col_anchors) \
and col_anchors[indexRight] < right:
indexRight += 1
return indexRight - indexLeft
keep_searching = True
while keep_searching:
keep_searching = False
@ -128,8 +133,7 @@ def search_header_from_body_bbox(body_bbox, textlines, col_anchors, max_v_gap):
# This is to avoid picking unrelated paragraphs.
if max_spread <= min(
MAX_COL_SPREAD_IN_HEADER,
math.ceil(len(col_anchors) / 2)
):
math.ceil(len(col_anchors) / 2)):
# Combined, the elements we've identified don't cross more
# than the authorized number of columns.
# We're trying to avoid
@ -145,7 +149,7 @@ def search_header_from_body_bbox(body_bbox, textlines, col_anchors, max_v_gap):
return new_bbox
class AlignmentCounter(object):
class AlignmentCounter():
"""
For a given textline, represent all other textlines aligned with it.
@ -260,7 +264,7 @@ class TextNetworks(TextAlignments):
removed_singletons = True
while removed_singletons:
removed_singletons = False
for alignment_id, textalignments in self._text_alignments.items():
for textalignments in self._text_alignments.values():
# For each alignment edge, remove items if they are singletons
# either horizontally or vertically
for ta in textalignments:
@ -308,8 +312,8 @@ class TextNetworks(TextAlignments):
# Retrieve the list of textlines it's aligned with, across both
# axes
best_alignment = self._textline_to_alignments[most_aligned_tl]
ref_h_alignment_id, ref_h_textlines = best_alignment.max_h()
ref_v_alignment_id, ref_v_textlines = best_alignment.max_v()
__, ref_h_textlines = best_alignment.max_h()
__, ref_v_textlines = best_alignment.max_v()
if len(ref_v_textlines) <= 1 or len(ref_h_textlines) <= 1:
return None
@ -375,7 +379,6 @@ class TextNetworks(TextAlignments):
else:
parse_details_search = None
MINIMUM_TEXTLINES_IN_TABLE = 6
bbox = (most_aligned_tl.x0, most_aligned_tl.y0,
most_aligned_tl.x1, most_aligned_tl.y1)
@ -402,12 +405,7 @@ class TextNetworks(TextAlignments):
# if the textline is close.
if h_distance < max_h_gap and v_distance < max_v_gap:
tls_in_bbox.append(tl)
bbox = (
min(bbox[0], tl.x0),
min(bbox[1], tl.y0),
max(bbox[2], tl.x1),
max(bbox[3], tl.y1)
)
bbox = expand_bbox_with_textline(bbox, tl)
del tls_search_space[i]
if len(tls_in_bbox) > MINIMUM_TEXTLINES_IN_TABLE:
return bbox
@ -472,8 +470,7 @@ class Hybrid(TextBaseParser):
row_tol=2,
column_tol=0,
debug=False,
**kwargs
):
**kwargs):
super().__init__(
"hybrid",
table_regions=table_regions,

View File

@ -100,8 +100,7 @@ class Lattice(BaseParser):
threshold_constant=-2,
iterations=0,
resolution=300,
**kwargs
):
**kwargs):
super().__init__(
"lattice",
table_regions=table_regions,

View File

@ -60,8 +60,7 @@ class Stream(TextBaseParser):
edge_tol=50,
row_tol=2,
column_tol=0,
**kwargs
):
**kwargs):
super().__init__(
"stream",
table_regions=table_regions,

View File

@ -136,7 +136,7 @@ def prepare_plot(table, ax=None, to_pdf_scale=True):
return ax
class PlotMethods(object):
class PlotMethods():
def __call__(self, table, kind="text", filename=None, ax=None):
"""Plot elements found on PDF page based on kind
specified, useful for debugging and playing with different

View File

@ -156,7 +156,7 @@ def remove_extra(kwargs, flavor="lattice"):
# https://stackoverflow.com/a/22726782
# and https://stackoverflow.com/questions/10965479
class TemporaryDirectory(object):
class TemporaryDirectory():
def __enter__(self):
self.name = tempfile.mkdtemp()
# Only delete the temporary directory upon
@ -488,6 +488,17 @@ def text_in_bbox_per_axis(bbox, horizontal_text, vertical_text):
return t_bbox
def expand_bbox_with_textline(bbox, textline):
    """Grow a bbox, when necessary, so that it also covers a textline.

    Parameters
    ----------
    bbox : tuple
        Four coordinates, compared index by index with the textline's
        ``x0``, ``y0``, ``x1`` and ``y1``.
    textline : object
        Any object exposing ``x0``, ``y0``, ``x1`` and ``y1`` attributes.

    Returns
    -------
    tuple
        A new 4-tuple: the smaller of each pair for the first two
        coordinates, the larger of each pair for the last two.

    """
    # The first two coordinates can only shrink towards the textline...
    low_x = min(bbox[0], textline.x0)
    low_y = min(bbox[1], textline.y0)
    # ...and the last two can only grow towards it.
    high_x = max(bbox[2], textline.x1)
    high_y = max(bbox[3], textline.y1)
    return (low_x, low_y, high_x, high_y)
def bbox_from_textlines(textlines):
"""Returns the smallest bbox containing all the text objects passed as
parameters.
@ -514,12 +525,7 @@ def bbox_from_textlines(textlines):
)
for tl in textlines[1:]:
bbox = (
min(bbox[0], tl.x0),
min(bbox[1], tl.y0),
max(bbox[2], tl.x1),
max(bbox[3], tl.y1)
)
bbox = expand_bbox_with_textline(bbox, tl)
return bbox
@ -1044,8 +1050,7 @@ def get_page_layout(
line_margin=0.5,
word_margin=0.1,
detect_vertical=True,
all_texts=True,
):
all_texts=True):
"""Returns a PDFMiner LTPage object and page dimension of a single
page pdf. See https://euske.github.io/pdfminer/ to get definitions
of kwargs.
@ -1163,14 +1168,14 @@ def compare_tables(left, right):
diff_cols = right.shape[1]-left.shape[1]
diff_rows = right.shape[0]-left.shape[0]
differences = []
if (diff_rows):
if diff_rows:
differences.append(
"{diff_rows} {more_fewer} rows".format(
diff_rows=abs(diff_rows),
more_fewer='more' if diff_rows > 0 else 'fewer'
)
)
if (diff_cols):
if diff_cols:
differences.append(
"{diff_cols} {more_fewer} columns".format(
diff_cols=abs(diff_cols),