Linting
parent
f7aafcd05c
commit
e1572a10c9
|
|
@ -18,7 +18,7 @@ logger = logging.getLogger("camelot")
|
|||
logger.setLevel(logging.INFO)
|
||||
|
||||
|
||||
class Config(object):
|
||||
class Config():
|
||||
def __init__(self):
|
||||
self.config = {}
|
||||
|
||||
|
|
|
|||
|
|
@ -31,7 +31,7 @@ VERTICAL_ALIGNMENTS = ["top", "bottom", "center"]
|
|||
ALL_ALIGNMENTS = HORIZONTAL_ALIGNMENTS + VERTICAL_ALIGNMENTS
|
||||
|
||||
|
||||
class TextAlignment(object):
|
||||
class TextAlignment():
|
||||
"""Represents a list of textlines sharing an alignment on a coordinate.
|
||||
|
||||
The alignment can be left/right/middle or top/bottom/center.
|
||||
|
|
@ -137,7 +137,7 @@ class TextEdge(TextAlignment):
|
|||
self.is_valid = True
|
||||
|
||||
|
||||
class TextAlignments(object):
|
||||
class TextAlignments():
|
||||
"""Defines a dict of text edges across reference alignments.
|
||||
"""
|
||||
|
||||
|
|
@ -327,7 +327,7 @@ class TextEdges(TextAlignments):
|
|||
return table_areas_padded
|
||||
|
||||
|
||||
class Cell(object):
|
||||
class Cell():
|
||||
"""Defines a cell in a table with coordinates relative to a
|
||||
left-bottom origin. (PDF coordinate space)
|
||||
|
||||
|
|
@ -409,7 +409,7 @@ class Cell(object):
|
|||
return self.top + self.bottom + self.left + self.right
|
||||
|
||||
|
||||
class Table(object):
|
||||
class Table():
|
||||
"""Defines a table with coordinates relative to a left-bottom
|
||||
origin. (PDF coordinate space)
|
||||
|
||||
|
|
@ -815,7 +815,7 @@ class Table(object):
|
|||
return self
|
||||
|
||||
|
||||
class TableList(object):
|
||||
class TableList():
|
||||
"""Defines a list of camelot.core.Table objects. Each table can
|
||||
be accessed using its index.
|
||||
|
||||
|
|
|
|||
|
|
@ -19,24 +19,24 @@ from ..utils import (
|
|||
from ..core import Table
|
||||
|
||||
|
||||
class BaseParser(object):
|
||||
class BaseParser():
|
||||
"""Defines a base parser.
|
||||
"""
|
||||
def __init__(
|
||||
self,
|
||||
parser_id,
|
||||
table_regions=None,
|
||||
table_areas=None,
|
||||
copy_text=None,
|
||||
split_text=False,
|
||||
strip_text="",
|
||||
shift_text=None,
|
||||
flag_size=False,
|
||||
debug=False
|
||||
):
|
||||
self,
|
||||
parser_id,
|
||||
table_regions=None,
|
||||
table_areas=None,
|
||||
copy_text=None,
|
||||
split_text=False,
|
||||
strip_text="",
|
||||
shift_text=None,
|
||||
flag_size=False,
|
||||
debug=False):
|
||||
self.id = parser_id
|
||||
self.table_regions = table_regions
|
||||
self.table_areas = table_areas
|
||||
self.table_bbox = {}
|
||||
|
||||
self.copy_text = copy_text
|
||||
self.split_text = split_text
|
||||
|
|
@ -49,7 +49,9 @@ class BaseParser(object):
|
|||
self.t_bbox = None
|
||||
|
||||
# For plotting details of parsing algorithms
|
||||
self.parse_details = {} if debug else None
|
||||
self.parse_details = {}
|
||||
if not debug:
|
||||
self.parse_details = None
|
||||
|
||||
def prepare_page_parse(self, filename, layout, dimensions,
|
||||
page_idx, layout_kwargs):
|
||||
|
|
@ -177,6 +179,18 @@ class BaseParser(object):
|
|||
table.cells[r_idx][c_idx].text = text
|
||||
return pos_errors
|
||||
|
||||
def _generate_columns_and_rows(self, bbox, table_idx):
|
||||
# Pure virtual, must be defined by the derived parser
|
||||
raise NotImplementedError()
|
||||
|
||||
def _generate_table(self, table_idx, cols, rows, **kwargs):
|
||||
# Pure virtual, must be defined by the derived parser
|
||||
raise NotImplementedError()
|
||||
|
||||
def _generate_table_bbox(self):
|
||||
# Pure virtual, must be defined by the derived parser
|
||||
raise NotImplementedError()
|
||||
|
||||
def extract_tables(self):
|
||||
if self._document_has_no_text():
|
||||
return []
|
||||
|
|
@ -188,8 +202,12 @@ class BaseParser(object):
|
|||
_tables = []
|
||||
# sort tables based on y-coord
|
||||
for table_idx, bbox in enumerate(
|
||||
sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True)
|
||||
):
|
||||
sorted(
|
||||
self.table_bbox.keys(),
|
||||
key=lambda x: x[1],
|
||||
reverse=True
|
||||
)
|
||||
):
|
||||
cols, rows, v_s, h_s = self._generate_columns_and_rows(
|
||||
bbox,
|
||||
table_idx
|
||||
|
|
@ -232,20 +250,19 @@ class TextBaseParser(BaseParser):
|
|||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
parser_id,
|
||||
table_regions=None,
|
||||
table_areas=None,
|
||||
columns=None,
|
||||
flag_size=False,
|
||||
split_text=False,
|
||||
strip_text="",
|
||||
edge_tol=50,
|
||||
row_tol=2,
|
||||
column_tol=0,
|
||||
debug=False,
|
||||
**kwargs
|
||||
):
|
||||
self,
|
||||
parser_id,
|
||||
table_regions=None,
|
||||
table_areas=None,
|
||||
columns=None,
|
||||
flag_size=False,
|
||||
split_text=False,
|
||||
strip_text="",
|
||||
edge_tol=50,
|
||||
row_tol=2,
|
||||
column_tol=0,
|
||||
debug=False,
|
||||
**kwargs):
|
||||
super().__init__(
|
||||
parser_id,
|
||||
table_regions=table_regions,
|
||||
|
|
|
|||
|
|
@ -3,9 +3,9 @@
|
|||
|
||||
from __future__ import division
|
||||
|
||||
import numpy as np
|
||||
import copy
|
||||
import math
|
||||
import numpy as np
|
||||
|
||||
from .base import TextBaseParser
|
||||
from ..core import (
|
||||
|
|
@ -16,6 +16,7 @@ from ..core import (
|
|||
)
|
||||
from ..utils import (
|
||||
bbox_from_str,
|
||||
expand_bbox_with_textline,
|
||||
text_in_bbox,
|
||||
bbox_from_textlines,
|
||||
distance_tl_to_bbox,
|
||||
|
|
@ -25,6 +26,23 @@ from ..utils import (
|
|||
# maximum number of columns over which a header can spread
|
||||
MAX_COL_SPREAD_IN_HEADER = 3
|
||||
|
||||
# Minimum number of textlines in a table
|
||||
MINIMUM_TEXTLINES_IN_TABLE = 6
|
||||
|
||||
|
||||
def column_spread(left, right, col_anchors):
|
||||
"""Get the number of columns crossed by a segment [left, right]."""
|
||||
index_left = 0
|
||||
while index_left < len(col_anchors) \
|
||||
and col_anchors[index_left] < left:
|
||||
index_left += 1
|
||||
index_right = index_left
|
||||
while index_right < len(col_anchors) \
|
||||
and col_anchors[index_right] < right:
|
||||
index_right += 1
|
||||
|
||||
return index_right - index_left
|
||||
|
||||
|
||||
def search_header_from_body_bbox(body_bbox, textlines, col_anchors, max_v_gap):
|
||||
"""Expand a bbox vertically up by looking for plausible headers.
|
||||
|
|
@ -40,19 +58,6 @@ def search_header_from_body_bbox(body_bbox, textlines, col_anchors, max_v_gap):
|
|||
(left, bottom, right, top) = body_bbox
|
||||
zones = []
|
||||
|
||||
def column_spread(left, right, col_anchors):
|
||||
"""Get the number of columns crossed by a segment [left, right]."""
|
||||
indexLeft = 0
|
||||
while indexLeft < len(col_anchors) \
|
||||
and col_anchors[indexLeft] < left:
|
||||
indexLeft += 1
|
||||
indexRight = indexLeft
|
||||
while indexRight < len(col_anchors) \
|
||||
and col_anchors[indexRight] < right:
|
||||
indexRight += 1
|
||||
|
||||
return indexRight - indexLeft
|
||||
|
||||
keep_searching = True
|
||||
while keep_searching:
|
||||
keep_searching = False
|
||||
|
|
@ -127,9 +132,8 @@ def search_header_from_body_bbox(body_bbox, textlines, col_anchors, max_v_gap):
|
|||
# columns.
|
||||
# This is to avoid picking unrelated paragraphs.
|
||||
if max_spread <= min(
|
||||
MAX_COL_SPREAD_IN_HEADER,
|
||||
math.ceil(len(col_anchors) / 2)
|
||||
):
|
||||
MAX_COL_SPREAD_IN_HEADER,
|
||||
math.ceil(len(col_anchors) / 2)):
|
||||
# Combined, the elements we've identified don't cross more
|
||||
# than the authorized number of columns.
|
||||
# We're trying to avoid
|
||||
|
|
@ -145,7 +149,7 @@ def search_header_from_body_bbox(body_bbox, textlines, col_anchors, max_v_gap):
|
|||
return new_bbox
|
||||
|
||||
|
||||
class AlignmentCounter(object):
|
||||
class AlignmentCounter():
|
||||
"""
|
||||
For a given textline, represent all other textlines aligned with it.
|
||||
|
||||
|
|
@ -260,7 +264,7 @@ class TextNetworks(TextAlignments):
|
|||
removed_singletons = True
|
||||
while removed_singletons:
|
||||
removed_singletons = False
|
||||
for alignment_id, textalignments in self._text_alignments.items():
|
||||
for textalignments in self._text_alignments.values():
|
||||
# For each alignment edge, remove items if they are singletons
|
||||
# either horizontally or vertically
|
||||
for ta in textalignments:
|
||||
|
|
@ -283,7 +287,7 @@ class TextNetworks(TextAlignments):
|
|||
return max(
|
||||
self._textline_to_alignments.keys(),
|
||||
key=lambda textline:
|
||||
self._textline_to_alignments[textline].alignment_score(),
|
||||
self._textline_to_alignments[textline].alignment_score(),
|
||||
default=None
|
||||
)
|
||||
|
||||
|
|
@ -308,8 +312,8 @@ class TextNetworks(TextAlignments):
|
|||
# Retrieve the list of textlines it's aligned with, across both
|
||||
# axis
|
||||
best_alignment = self._textline_to_alignments[most_aligned_tl]
|
||||
ref_h_alignment_id, ref_h_textlines = best_alignment.max_h()
|
||||
ref_v_alignment_id, ref_v_textlines = best_alignment.max_v()
|
||||
__, ref_h_textlines = best_alignment.max_h()
|
||||
__, ref_v_textlines = best_alignment.max_v()
|
||||
if len(ref_v_textlines) <= 1 or len(ref_h_textlines) <= 1:
|
||||
return None
|
||||
|
||||
|
|
@ -375,7 +379,6 @@ class TextNetworks(TextAlignments):
|
|||
else:
|
||||
parse_details_search = None
|
||||
|
||||
MINIMUM_TEXTLINES_IN_TABLE = 6
|
||||
bbox = (most_aligned_tl.x0, most_aligned_tl.y0,
|
||||
most_aligned_tl.x1, most_aligned_tl.y1)
|
||||
|
||||
|
|
@ -402,12 +405,7 @@ class TextNetworks(TextAlignments):
|
|||
# if the textline is close.
|
||||
if h_distance < max_h_gap and v_distance < max_v_gap:
|
||||
tls_in_bbox.append(tl)
|
||||
bbox = (
|
||||
min(bbox[0], tl.x0),
|
||||
min(bbox[1], tl.y0),
|
||||
max(bbox[2], tl.x1),
|
||||
max(bbox[3], tl.y1)
|
||||
)
|
||||
bbox = expand_bbox_with_textline(bbox, tl)
|
||||
del tls_search_space[i]
|
||||
if len(tls_in_bbox) > MINIMUM_TEXTLINES_IN_TABLE:
|
||||
return bbox
|
||||
|
|
@ -461,19 +459,18 @@ class Hybrid(TextBaseParser):
|
|||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
table_regions=None,
|
||||
table_areas=None,
|
||||
columns=None,
|
||||
flag_size=False,
|
||||
split_text=False,
|
||||
strip_text="",
|
||||
edge_tol=None,
|
||||
row_tol=2,
|
||||
column_tol=0,
|
||||
debug=False,
|
||||
**kwargs
|
||||
):
|
||||
self,
|
||||
table_regions=None,
|
||||
table_areas=None,
|
||||
columns=None,
|
||||
flag_size=False,
|
||||
split_text=False,
|
||||
strip_text="",
|
||||
edge_tol=None,
|
||||
row_tol=2,
|
||||
column_tol=0,
|
||||
debug=False,
|
||||
**kwargs):
|
||||
super().__init__(
|
||||
"hybrid",
|
||||
table_regions=table_regions,
|
||||
|
|
|
|||
|
|
@ -84,24 +84,23 @@ class Lattice(BaseParser):
|
|||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
table_regions=None,
|
||||
table_areas=None,
|
||||
process_background=False,
|
||||
line_scale=15,
|
||||
copy_text=None,
|
||||
shift_text=None,
|
||||
split_text=False,
|
||||
flag_size=False,
|
||||
strip_text="",
|
||||
line_tol=2,
|
||||
joint_tol=2,
|
||||
threshold_blocksize=15,
|
||||
threshold_constant=-2,
|
||||
iterations=0,
|
||||
resolution=300,
|
||||
**kwargs
|
||||
):
|
||||
self,
|
||||
table_regions=None,
|
||||
table_areas=None,
|
||||
process_background=False,
|
||||
line_scale=15,
|
||||
copy_text=None,
|
||||
shift_text=None,
|
||||
split_text=False,
|
||||
flag_size=False,
|
||||
strip_text="",
|
||||
line_tol=2,
|
||||
joint_tol=2,
|
||||
threshold_blocksize=15,
|
||||
threshold_constant=-2,
|
||||
iterations=0,
|
||||
resolution=300,
|
||||
**kwargs):
|
||||
super().__init__(
|
||||
"lattice",
|
||||
table_regions=table_regions,
|
||||
|
|
|
|||
|
|
@ -50,18 +50,17 @@ class Stream(TextBaseParser):
|
|||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
table_regions=None,
|
||||
table_areas=None,
|
||||
columns=None,
|
||||
flag_size=False,
|
||||
split_text=False,
|
||||
strip_text="",
|
||||
edge_tol=50,
|
||||
row_tol=2,
|
||||
column_tol=0,
|
||||
**kwargs
|
||||
):
|
||||
self,
|
||||
table_regions=None,
|
||||
table_areas=None,
|
||||
columns=None,
|
||||
flag_size=False,
|
||||
split_text=False,
|
||||
strip_text="",
|
||||
edge_tol=50,
|
||||
row_tol=2,
|
||||
column_tol=0,
|
||||
**kwargs):
|
||||
super().__init__(
|
||||
"stream",
|
||||
table_regions=table_regions,
|
||||
|
|
|
|||
|
|
@ -136,7 +136,7 @@ def prepare_plot(table, ax=None, to_pdf_scale=True):
|
|||
return ax
|
||||
|
||||
|
||||
class PlotMethods(object):
|
||||
class PlotMethods():
|
||||
def __call__(self, table, kind="text", filename=None, ax=None):
|
||||
"""Plot elements found on PDF page based on kind
|
||||
specified, useful for debugging and playing with different
|
||||
|
|
|
|||
|
|
@ -156,7 +156,7 @@ def remove_extra(kwargs, flavor="lattice"):
|
|||
|
||||
# https://stackoverflow.com/a/22726782
|
||||
# and https://stackoverflow.com/questions/10965479
|
||||
class TemporaryDirectory(object):
|
||||
class TemporaryDirectory():
|
||||
def __enter__(self):
|
||||
self.name = tempfile.mkdtemp()
|
||||
# Only delete the temporary directory upon
|
||||
|
|
@ -488,6 +488,17 @@ def text_in_bbox_per_axis(bbox, horizontal_text, vertical_text):
|
|||
return t_bbox
|
||||
|
||||
|
||||
def expand_bbox_with_textline(bbox, textline):
|
||||
"""Expand (if needed) a bbox so that it fits the parameter textline.
|
||||
"""
|
||||
return (
|
||||
min(bbox[0], textline.x0),
|
||||
min(bbox[1], textline.y0),
|
||||
max(bbox[2], textline.x1),
|
||||
max(bbox[3], textline.y1)
|
||||
)
|
||||
|
||||
|
||||
def bbox_from_textlines(textlines):
|
||||
"""Returns the smallest bbox containing all the text objects passed as
|
||||
a parameters.
|
||||
|
|
@ -514,12 +525,7 @@ def bbox_from_textlines(textlines):
|
|||
)
|
||||
|
||||
for tl in textlines[1:]:
|
||||
bbox = (
|
||||
min(bbox[0], tl.x0),
|
||||
min(bbox[1], tl.y0),
|
||||
max(bbox[2], tl.x1),
|
||||
max(bbox[3], tl.y1)
|
||||
)
|
||||
bbox = expand_bbox_with_textline(bbox, tl)
|
||||
return bbox
|
||||
|
||||
|
||||
|
|
@ -1039,13 +1045,12 @@ def compute_whitespace(d):
|
|||
|
||||
|
||||
def get_page_layout(
|
||||
filename,
|
||||
char_margin=1.0,
|
||||
line_margin=0.5,
|
||||
word_margin=0.1,
|
||||
detect_vertical=True,
|
||||
all_texts=True,
|
||||
):
|
||||
filename,
|
||||
char_margin=1.0,
|
||||
line_margin=0.5,
|
||||
word_margin=0.1,
|
||||
detect_vertical=True,
|
||||
all_texts=True):
|
||||
"""Returns a PDFMiner LTPage object and page dimension of a single
|
||||
page pdf. See https://euske.github.io/pdfminer/ to get definitions
|
||||
of kwargs.
|
||||
|
|
@ -1163,14 +1168,14 @@ def compare_tables(left, right):
|
|||
diff_cols = right.shape[1]-left.shape[1]
|
||||
diff_rows = right.shape[0]-left.shape[0]
|
||||
differences = []
|
||||
if (diff_rows):
|
||||
if diff_rows:
|
||||
differences.append(
|
||||
"{diff_rows} {more_fewer} rows".format(
|
||||
diff_rows=abs(diff_rows),
|
||||
more_fewer='more' if diff_rows > 0 else 'fewer'
|
||||
)
|
||||
)
|
||||
if (diff_cols):
|
||||
if diff_cols:
|
||||
differences.append(
|
||||
"{diff_cols} {more_fewer} columns".format(
|
||||
diff_cols=abs(diff_cols),
|
||||
|
|
|
|||
Loading…
Reference in New Issue