pull/153/head
Frh 2020-04-25 22:47:23 -07:00
parent f7aafcd05c
commit e1572a10c9
8 changed files with 141 additions and 124 deletions

View File

@ -18,7 +18,7 @@ logger = logging.getLogger("camelot")
logger.setLevel(logging.INFO) logger.setLevel(logging.INFO)
class Config(object): class Config():
def __init__(self): def __init__(self):
self.config = {} self.config = {}

View File

@ -31,7 +31,7 @@ VERTICAL_ALIGNMENTS = ["top", "bottom", "center"]
ALL_ALIGNMENTS = HORIZONTAL_ALIGNMENTS + VERTICAL_ALIGNMENTS ALL_ALIGNMENTS = HORIZONTAL_ALIGNMENTS + VERTICAL_ALIGNMENTS
class TextAlignment(object): class TextAlignment():
"""Represents a list of textlines sharing an alignment on a coordinate. """Represents a list of textlines sharing an alignment on a coordinate.
The alignment can be left/right/middle or top/bottom/center. The alignment can be left/right/middle or top/bottom/center.
@ -137,7 +137,7 @@ class TextEdge(TextAlignment):
self.is_valid = True self.is_valid = True
class TextAlignments(object): class TextAlignments():
"""Defines a dict of text edges across reference alignments. """Defines a dict of text edges across reference alignments.
""" """
@ -327,7 +327,7 @@ class TextEdges(TextAlignments):
return table_areas_padded return table_areas_padded
class Cell(object): class Cell():
"""Defines a cell in a table with coordinates relative to a """Defines a cell in a table with coordinates relative to a
left-bottom origin. (PDF coordinate space) left-bottom origin. (PDF coordinate space)
@ -409,7 +409,7 @@ class Cell(object):
return self.top + self.bottom + self.left + self.right return self.top + self.bottom + self.left + self.right
class Table(object): class Table():
"""Defines a table with coordinates relative to a left-bottom """Defines a table with coordinates relative to a left-bottom
origin. (PDF coordinate space) origin. (PDF coordinate space)
@ -815,7 +815,7 @@ class Table(object):
return self return self
class TableList(object): class TableList():
"""Defines a list of camelot.core.Table objects. Each table can """Defines a list of camelot.core.Table objects. Each table can
be accessed using its index. be accessed using its index.

View File

@ -19,7 +19,7 @@ from ..utils import (
from ..core import Table from ..core import Table
class BaseParser(object): class BaseParser():
"""Defines a base parser. """Defines a base parser.
""" """
def __init__( def __init__(
@ -32,11 +32,11 @@ class BaseParser(object):
strip_text="", strip_text="",
shift_text=None, shift_text=None,
flag_size=False, flag_size=False,
debug=False debug=False):
):
self.id = parser_id self.id = parser_id
self.table_regions = table_regions self.table_regions = table_regions
self.table_areas = table_areas self.table_areas = table_areas
self.table_bbox = {}
self.copy_text = copy_text self.copy_text = copy_text
self.split_text = split_text self.split_text = split_text
@ -49,7 +49,9 @@ class BaseParser(object):
self.t_bbox = None self.t_bbox = None
# For plotting details of parsing algorithms # For plotting details of parsing algorithms
self.parse_details = {} if debug else None self.parse_details = {}
if not debug:
self.parse_details = None
def prepare_page_parse(self, filename, layout, dimensions, def prepare_page_parse(self, filename, layout, dimensions,
page_idx, layout_kwargs): page_idx, layout_kwargs):
@ -177,6 +179,18 @@ class BaseParser(object):
table.cells[r_idx][c_idx].text = text table.cells[r_idx][c_idx].text = text
return pos_errors return pos_errors
def _generate_columns_and_rows(self, bbox, table_idx):
# Pure virtual, must be defined by the derived parser
raise NotImplementedError()
def _generate_table(self, table_idx, cols, rows, **kwargs):
# Pure virtual, must be defined by the derived parser
raise NotImplementedError()
def _generate_table_bbox(self):
# Pure virtual, must be defined by the derived parser
raise NotImplementedError()
def extract_tables(self): def extract_tables(self):
if self._document_has_no_text(): if self._document_has_no_text():
return [] return []
@ -188,7 +202,11 @@ class BaseParser(object):
_tables = [] _tables = []
# sort tables based on y-coord # sort tables based on y-coord
for table_idx, bbox in enumerate( for table_idx, bbox in enumerate(
sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True) sorted(
self.table_bbox.keys(),
key=lambda x: x[1],
reverse=True
)
): ):
cols, rows, v_s, h_s = self._generate_columns_and_rows( cols, rows, v_s, h_s = self._generate_columns_and_rows(
bbox, bbox,
@ -244,8 +262,7 @@ class TextBaseParser(BaseParser):
row_tol=2, row_tol=2,
column_tol=0, column_tol=0,
debug=False, debug=False,
**kwargs **kwargs):
):
super().__init__( super().__init__(
parser_id, parser_id,
table_regions=table_regions, table_regions=table_regions,

View File

@ -3,9 +3,9 @@
from __future__ import division from __future__ import division
import numpy as np
import copy import copy
import math import math
import numpy as np
from .base import TextBaseParser from .base import TextBaseParser
from ..core import ( from ..core import (
@ -16,6 +16,7 @@ from ..core import (
) )
from ..utils import ( from ..utils import (
bbox_from_str, bbox_from_str,
expand_bbox_with_textline,
text_in_bbox, text_in_bbox,
bbox_from_textlines, bbox_from_textlines,
distance_tl_to_bbox, distance_tl_to_bbox,
@ -25,6 +26,23 @@ from ..utils import (
# maximum number of columns over which a header can spread # maximum number of columns over which a header can spread
MAX_COL_SPREAD_IN_HEADER = 3 MAX_COL_SPREAD_IN_HEADER = 3
# Minimum number of textlines in a table.
# The bbox search compares with a strict ">": a bbox is only returned
# when it gathered more textlines than this value.
MINIMUM_TEXTLINES_IN_TABLE = 6
def column_spread(left, right, col_anchors):
    """Count the column anchors crossed by the segment [left, right].

    Parameters
    ----------
    left : float
        Lower bound of the segment.
    right : float
        Upper bound of the segment.
    col_anchors : list
        Coordinates of the column anchors, scanned in list order.

    Returns
    -------
    spread : int
        Number of anchors between the first anchor that is not below
        ``left`` and the first following anchor that is not below
        ``right``.

    """
    anchor_count = len(col_anchors)
    # First anchor which is not strictly left of the segment start.
    first = next(
        (
            pos for pos, anchor in enumerate(col_anchors)
            if not anchor < left
        ),
        anchor_count
    )
    # Resume from there, up to the first anchor not strictly left of the
    # segment end.
    last = next(
        (
            pos for pos in range(first, anchor_count)
            if not col_anchors[pos] < right
        ),
        anchor_count
    )
    return last - first
def search_header_from_body_bbox(body_bbox, textlines, col_anchors, max_v_gap): def search_header_from_body_bbox(body_bbox, textlines, col_anchors, max_v_gap):
"""Expand a bbox vertically up by looking for plausible headers. """Expand a bbox vertically up by looking for plausible headers.
@ -40,19 +58,6 @@ def search_header_from_body_bbox(body_bbox, textlines, col_anchors, max_v_gap):
(left, bottom, right, top) = body_bbox (left, bottom, right, top) = body_bbox
zones = [] zones = []
def column_spread(left, right, col_anchors):
"""Get the number of columns crossed by a segment [left, right]."""
indexLeft = 0
while indexLeft < len(col_anchors) \
and col_anchors[indexLeft] < left:
indexLeft += 1
indexRight = indexLeft
while indexRight < len(col_anchors) \
and col_anchors[indexRight] < right:
indexRight += 1
return indexRight - indexLeft
keep_searching = True keep_searching = True
while keep_searching: while keep_searching:
keep_searching = False keep_searching = False
@ -128,8 +133,7 @@ def search_header_from_body_bbox(body_bbox, textlines, col_anchors, max_v_gap):
# This is to avoid picking unrelated paragraphs. # This is to avoid picking unrelated paragraphs.
if max_spread <= min( if max_spread <= min(
MAX_COL_SPREAD_IN_HEADER, MAX_COL_SPREAD_IN_HEADER,
math.ceil(len(col_anchors) / 2) math.ceil(len(col_anchors) / 2)):
):
# Combined, the elements we've identified don't cross more # Combined, the elements we've identified don't cross more
# than the authorized number of columns. # than the authorized number of columns.
# We're trying to avoid # We're trying to avoid
@ -145,7 +149,7 @@ def search_header_from_body_bbox(body_bbox, textlines, col_anchors, max_v_gap):
return new_bbox return new_bbox
class AlignmentCounter(object): class AlignmentCounter():
""" """
For a given textline, represent all other textlines aligned with it. For a given textline, represent all other textlines aligned with it.
@ -260,7 +264,7 @@ class TextNetworks(TextAlignments):
removed_singletons = True removed_singletons = True
while removed_singletons: while removed_singletons:
removed_singletons = False removed_singletons = False
for alignment_id, textalignments in self._text_alignments.items(): for textalignments in self._text_alignments.values():
# For each alignment edge, remove items if they are singletons # For each alignment edge, remove items if they are singletons
# either horizontally or vertically # either horizontally or vertically
for ta in textalignments: for ta in textalignments:
@ -308,8 +312,8 @@ class TextNetworks(TextAlignments):
# Retrieve the list of textlines it's aligned with, across both # Retrieve the list of textlines it's aligned with, across both
# axis # axis
best_alignment = self._textline_to_alignments[most_aligned_tl] best_alignment = self._textline_to_alignments[most_aligned_tl]
ref_h_alignment_id, ref_h_textlines = best_alignment.max_h() __, ref_h_textlines = best_alignment.max_h()
ref_v_alignment_id, ref_v_textlines = best_alignment.max_v() __, ref_v_textlines = best_alignment.max_v()
if len(ref_v_textlines) <= 1 or len(ref_h_textlines) <= 1: if len(ref_v_textlines) <= 1 or len(ref_h_textlines) <= 1:
return None return None
@ -375,7 +379,6 @@ class TextNetworks(TextAlignments):
else: else:
parse_details_search = None parse_details_search = None
MINIMUM_TEXTLINES_IN_TABLE = 6
bbox = (most_aligned_tl.x0, most_aligned_tl.y0, bbox = (most_aligned_tl.x0, most_aligned_tl.y0,
most_aligned_tl.x1, most_aligned_tl.y1) most_aligned_tl.x1, most_aligned_tl.y1)
@ -402,12 +405,7 @@ class TextNetworks(TextAlignments):
# if the textline is close. # if the textline is close.
if h_distance < max_h_gap and v_distance < max_v_gap: if h_distance < max_h_gap and v_distance < max_v_gap:
tls_in_bbox.append(tl) tls_in_bbox.append(tl)
bbox = ( bbox = expand_bbox_with_textline(bbox, tl)
min(bbox[0], tl.x0),
min(bbox[1], tl.y0),
max(bbox[2], tl.x1),
max(bbox[3], tl.y1)
)
del tls_search_space[i] del tls_search_space[i]
if len(tls_in_bbox) > MINIMUM_TEXTLINES_IN_TABLE: if len(tls_in_bbox) > MINIMUM_TEXTLINES_IN_TABLE:
return bbox return bbox
@ -472,8 +470,7 @@ class Hybrid(TextBaseParser):
row_tol=2, row_tol=2,
column_tol=0, column_tol=0,
debug=False, debug=False,
**kwargs **kwargs):
):
super().__init__( super().__init__(
"hybrid", "hybrid",
table_regions=table_regions, table_regions=table_regions,

View File

@ -100,8 +100,7 @@ class Lattice(BaseParser):
threshold_constant=-2, threshold_constant=-2,
iterations=0, iterations=0,
resolution=300, resolution=300,
**kwargs **kwargs):
):
super().__init__( super().__init__(
"lattice", "lattice",
table_regions=table_regions, table_regions=table_regions,

View File

@ -60,8 +60,7 @@ class Stream(TextBaseParser):
edge_tol=50, edge_tol=50,
row_tol=2, row_tol=2,
column_tol=0, column_tol=0,
**kwargs **kwargs):
):
super().__init__( super().__init__(
"stream", "stream",
table_regions=table_regions, table_regions=table_regions,

View File

@ -136,7 +136,7 @@ def prepare_plot(table, ax=None, to_pdf_scale=True):
return ax return ax
class PlotMethods(object): class PlotMethods():
def __call__(self, table, kind="text", filename=None, ax=None): def __call__(self, table, kind="text", filename=None, ax=None):
"""Plot elements found on PDF page based on kind """Plot elements found on PDF page based on kind
specified, useful for debugging and playing with different specified, useful for debugging and playing with different

View File

@ -156,7 +156,7 @@ def remove_extra(kwargs, flavor="lattice"):
# https://stackoverflow.com/a/22726782 # https://stackoverflow.com/a/22726782
# and https://stackoverflow.com/questions/10965479 # and https://stackoverflow.com/questions/10965479
class TemporaryDirectory(object): class TemporaryDirectory():
def __enter__(self): def __enter__(self):
self.name = tempfile.mkdtemp() self.name = tempfile.mkdtemp()
# Only delete the temporary directory upon # Only delete the temporary directory upon
@ -488,6 +488,17 @@ def text_in_bbox_per_axis(bbox, horizontal_text, vertical_text):
return t_bbox return t_bbox
def expand_bbox_with_textline(bbox, textline):
    """Grow a bbox, when necessary, until it encloses a textline.

    Parameters
    ----------
    bbox : tuple
        Sequence indexed as (x0, y0, x1, y1).
    textline : object
        Object exposing ``x0``, ``y0``, ``x1`` and ``y1`` attributes.

    Returns
    -------
    bbox : tuple
        New (x0, y0, x1, y1) tuple covering both the input bbox and the
        textline. The input bbox is left untouched.

    """
    # The lower bounds can only move down, the upper bounds only up.
    x0 = min(bbox[0], textline.x0)
    y0 = min(bbox[1], textline.y0)
    x1 = max(bbox[2], textline.x1)
    y1 = max(bbox[3], textline.y1)
    return (x0, y0, x1, y1)
def bbox_from_textlines(textlines): def bbox_from_textlines(textlines):
"""Returns the smallest bbox containing all the text objects passed as """Returns the smallest bbox containing all the text objects passed as
a parameters. a parameters.
@ -514,12 +525,7 @@ def bbox_from_textlines(textlines):
) )
for tl in textlines[1:]: for tl in textlines[1:]:
bbox = ( bbox = expand_bbox_with_textline(bbox, tl)
min(bbox[0], tl.x0),
min(bbox[1], tl.y0),
max(bbox[2], tl.x1),
max(bbox[3], tl.y1)
)
return bbox return bbox
@ -1044,8 +1050,7 @@ def get_page_layout(
line_margin=0.5, line_margin=0.5,
word_margin=0.1, word_margin=0.1,
detect_vertical=True, detect_vertical=True,
all_texts=True, all_texts=True):
):
"""Returns a PDFMiner LTPage object and page dimension of a single """Returns a PDFMiner LTPage object and page dimension of a single
page pdf. See https://euske.github.io/pdfminer/ to get definitions page pdf. See https://euske.github.io/pdfminer/ to get definitions
of kwargs. of kwargs.
@ -1163,14 +1168,14 @@ def compare_tables(left, right):
diff_cols = right.shape[1]-left.shape[1] diff_cols = right.shape[1]-left.shape[1]
diff_rows = right.shape[0]-left.shape[0] diff_rows = right.shape[0]-left.shape[0]
differences = [] differences = []
if (diff_rows): if diff_rows:
differences.append( differences.append(
"{diff_rows} {more_fewer} rows".format( "{diff_rows} {more_fewer} rows".format(
diff_rows=abs(diff_rows), diff_rows=abs(diff_rows),
more_fewer='more' if diff_rows > 0 else 'fewer' more_fewer='more' if diff_rows > 0 else 'fewer'
) )
) )
if (diff_cols): if diff_cols:
differences.append( differences.append(
"{diff_cols} {more_fewer} columns".format( "{diff_cols} {more_fewer} columns".format(
diff_cols=abs(diff_cols), diff_cols=abs(diff_cols),