Add support for region/area for hybrid
parent
f5fe92c22e
commit
e0e3ff4e07
|
|
@ -23,6 +23,7 @@ class BaseParser(object):
|
||||||
strip_text="",
|
strip_text="",
|
||||||
shift_text=None,
|
shift_text=None,
|
||||||
flag_size=False,
|
flag_size=False,
|
||||||
|
debug=False
|
||||||
):
|
):
|
||||||
self.id = parser_id
|
self.id = parser_id
|
||||||
self.table_regions = table_regions
|
self.table_regions = table_regions
|
||||||
|
|
@ -39,7 +40,7 @@ class BaseParser(object):
|
||||||
self.t_bbox = None
|
self.t_bbox = None
|
||||||
|
|
||||||
# For plotting details of parsing algorithms
|
# For plotting details of parsing algorithms
|
||||||
self.debug_info = {}
|
self.debug_info = {} if debug else None
|
||||||
|
|
||||||
def prepare_page_parse(self, filename, layout, dimensions,
|
def prepare_page_parse(self, filename, layout, dimensions,
|
||||||
page_idx, layout_kwargs):
|
page_idx, layout_kwargs):
|
||||||
|
|
@ -60,6 +61,10 @@ class BaseParser(object):
|
||||||
self.pdf_width, self.pdf_height = self.dimensions
|
self.pdf_width, self.pdf_height = self.dimensions
|
||||||
self.rootname, __ = os.path.splitext(self.filename)
|
self.rootname, __ = os.path.splitext(self.filename)
|
||||||
|
|
||||||
|
if self.debug_info is not None:
|
||||||
|
self.debug_info["table_regions"] = self.table_regions
|
||||||
|
self.debug_info["table_areas"] = self.table_areas
|
||||||
|
|
||||||
def _document_has_no_text(self):
|
def _document_has_no_text(self):
|
||||||
if not self.horizontal_text:
|
if not self.horizontal_text:
|
||||||
rootname = os.path.basename(self.rootname)
|
rootname = os.path.basename(self.rootname)
|
||||||
|
|
|
||||||
|
|
@ -7,6 +7,7 @@ import warnings
|
||||||
|
|
||||||
from .base import BaseParser
|
from .base import BaseParser
|
||||||
from ..utils import (
|
from ..utils import (
|
||||||
|
bbox_from_str,
|
||||||
text_in_bbox,
|
text_in_bbox,
|
||||||
text_in_bbox_per_axis,
|
text_in_bbox_per_axis,
|
||||||
bbox_from_text,
|
bbox_from_text,
|
||||||
|
|
@ -21,6 +22,23 @@ from matplotlib import patches as patches
|
||||||
MAX_COL_SPREAD_IN_HEADER = 3
|
MAX_COL_SPREAD_IN_HEADER = 3
|
||||||
|
|
||||||
|
|
||||||
|
def plot_annotated_bbox(plot, bbox, text, rect_color):
    """Draw an unfilled rectangle around a bbox and label it with text.

    Parameters
    ----------
    plot : object
        Drawing target exposing ``add_patch`` and ``text`` (presumably a
        matplotlib Axes -- confirm against callers).
    bbox : tuple
        Coordinates indexed as (x0, y0, x1, y1).
    text : str
        Label drawn at (bbox[0], bbox[1]).
    rect_color : str
        Color used for the rectangle outline and for the label background.

    """
    x0, y0, x1, y1 = bbox[0], bbox[1], bbox[2], bbox[3]
    plot.add_patch(
        patches.Rectangle(
            (x0, y0),
            x1 - x0, y1 - y0,
            # Use the caller's color: it was previously hard-coded to
            # "purple", so differently colored boxes could not be told apart.
            color=rect_color, linewidth=3,
            fill=False
        )
    )
    plot.text(
        x0, y0,
        text,
        fontsize=12, color="black", verticalalignment="top",
        # Label background matches the outline of the box it annotates.
        bbox=dict(facecolor=rect_color, alpha=0.5)
    )
|
||||||
|
|
||||||
|
|
||||||
def todo_move_me_expand_area_for_header(area, textlines, col_anchors,
|
def todo_move_me_expand_area_for_header(area, textlines, col_anchors,
|
||||||
average_row_height):
|
average_row_height):
|
||||||
"""The core algorithm is based on fairly strict alignment of text.
|
"""The core algorithm is based on fairly strict alignment of text.
|
||||||
|
|
@ -273,6 +291,7 @@ class TextEdges2(object):
|
||||||
"center": (textline.y0 + textline.y1) / 2.0,
|
"center": (textline.y0 + textline.y1) / 2.0,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# FRHTODO: Move to utils and use generic name
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _get_index_closest_point(coord, edge_array):
|
def _get_index_closest_point(coord, edge_array):
|
||||||
"""Returns the index of the closest point
|
"""Returns the index of the closest point
|
||||||
|
|
@ -481,12 +500,63 @@ class TextEdges2(object):
|
||||||
default=None
|
default=None
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# First, determine the textline that has the most combined alignments
|
||||||
|
# across horizontal and vertical axis.
|
||||||
|
# It will serve both as a starting point for the table boundary search,
|
||||||
|
# and as a way to estimate the average spacing between rows/cols.
|
||||||
|
most_aligned_tl = get_best_textline(tls_search_space)
|
||||||
|
most_aligned_coords = TextEdges2.get_textline_coords(most_aligned_tl)
|
||||||
|
|
||||||
|
# Retrieve the list of textlines it's aligned with, across both axis
|
||||||
|
best_alignment = self._textlines_alignments[most_aligned_tl]
|
||||||
|
ref_h_edge_name = best_alignment.max_h_edge_name()
|
||||||
|
ref_v_edge_name = best_alignment.max_v_edge_name()
|
||||||
|
best_h_textedges = self._textedges[ref_h_edge_name]
|
||||||
|
best_v_textedges = self._textedges[ref_v_edge_name]
|
||||||
|
h_coord = most_aligned_coords[ref_h_edge_name]
|
||||||
|
v_coord = most_aligned_coords[ref_v_edge_name]
|
||||||
|
h_textlines = sorted(
|
||||||
|
best_h_textedges[
|
||||||
|
TextEdges2._get_index_closest_point(
|
||||||
|
h_coord,
|
||||||
|
best_h_textedges
|
||||||
|
)
|
||||||
|
].textlines,
|
||||||
|
key=lambda tl: tl.x0,
|
||||||
|
reverse=True
|
||||||
|
)
|
||||||
|
v_textlines = sorted(
|
||||||
|
best_v_textedges[
|
||||||
|
TextEdges2._get_index_closest_point(
|
||||||
|
v_coord,
|
||||||
|
best_v_textedges
|
||||||
|
)
|
||||||
|
].textlines,
|
||||||
|
key=lambda tl: tl.y0,
|
||||||
|
reverse=True
|
||||||
|
)
|
||||||
|
|
||||||
|
h_gaps, v_gaps = [], []
|
||||||
|
for i in range(1, len(v_textlines)):
|
||||||
|
v_gaps.append(v_textlines[i-1].y0 - v_textlines[i].y0)
|
||||||
|
for i in range(1, len(h_textlines)):
|
||||||
|
h_gaps.append(h_textlines[i-1].x0 - h_textlines[i].x0)
|
||||||
|
|
||||||
|
if (not h_gaps or not v_gaps):
|
||||||
|
return None
|
||||||
|
percentile = 75
|
||||||
|
gaps_hv = (
|
||||||
|
np.percentile(h_gaps, percentile),
|
||||||
|
np.percentile(v_gaps, percentile)
|
||||||
|
)
|
||||||
|
|
||||||
# Calculate the 75th percentile of the horizontal/vertical
|
# Calculate the 75th percentile of the horizontal/vertical
|
||||||
# gaps between textlines. Use this as a reference for a threshold
|
# gaps between textlines. Use this as a reference for a threshold
|
||||||
# to not exceed while looking for table boundaries.
|
# to not exceed while looking for table boundaries.
|
||||||
gaps_hv = self._calculate_gaps_thresholds(75)
|
# FRHTODO: Clean this up
|
||||||
if (gaps_hv[0] is None or gaps_hv[1] is None):
|
# gaps_hv = self._calculate_gaps_thresholds(75)
|
||||||
return None
|
# if (gaps_hv[0] is None or gaps_hv[1] is None):
|
||||||
|
# return None
|
||||||
max_h_gap, max_v_gap = gaps_hv[0] * 3, gaps_hv[1] * 3
|
max_h_gap, max_v_gap = gaps_hv[0] * 3, gaps_hv[1] * 3
|
||||||
|
|
||||||
if debug_info is not None:
|
if debug_info is not None:
|
||||||
|
|
@ -501,11 +571,10 @@ class TextEdges2(object):
|
||||||
debug_info_search = None
|
debug_info_search = None
|
||||||
|
|
||||||
MINIMUM_TEXTLINES_IN_TABLE = 6
|
MINIMUM_TEXTLINES_IN_TABLE = 6
|
||||||
tl_most_aligned = get_best_textline(tls_search_space)
|
bbox = (most_aligned_tl.x0, most_aligned_tl.y0,
|
||||||
bbox = (tl_most_aligned.x0, tl_most_aligned.y0,
|
most_aligned_tl.x1, most_aligned_tl.y1)
|
||||||
tl_most_aligned.x1, tl_most_aligned.y1)
|
tls_search_space.remove(most_aligned_tl)
|
||||||
tls_search_space.remove(tl_most_aligned)
|
tls_in_bbox = [most_aligned_tl]
|
||||||
tls_in_bbox = [tl_most_aligned]
|
|
||||||
last_bbox = None
|
last_bbox = None
|
||||||
while last_bbox != bbox:
|
while last_bbox != bbox:
|
||||||
if debug_info_search is not None:
|
if debug_info_search is not None:
|
||||||
|
|
@ -581,6 +650,19 @@ class TextEdges2(object):
|
||||||
def plotFRHTableSearch(self, plot, debug_info):
|
def plotFRHTableSearch(self, plot, debug_info):
|
||||||
if debug_info is None:
|
if debug_info is None:
|
||||||
return
|
return
|
||||||
|
# Display a bbox per region
|
||||||
|
for region_str in debug_info["table_regions"] or []:
|
||||||
|
plot_annotated_bbox(
|
||||||
|
plot, bbox_from_str(region_str),
|
||||||
|
"region: ({region_str})".format(region_str=region_str),
|
||||||
|
"purple"
|
||||||
|
)
|
||||||
|
# Display a bbox per area
|
||||||
|
for area_str in debug_info["table_areas"] or []:
|
||||||
|
plot_annotated_bbox(
|
||||||
|
plot, bbox_from_str(area_str),
|
||||||
|
"area: ({area_str})".format(area_str=area_str), "pink"
|
||||||
|
)
|
||||||
for box_id, bbox_search in enumerate(debug_info["bboxes_searches"]):
|
for box_id, bbox_search in enumerate(debug_info["bboxes_searches"]):
|
||||||
max_h_gap = bbox_search["max_h_gap"]
|
max_h_gap = bbox_search["max_h_gap"]
|
||||||
max_v_gap = bbox_search["max_v_gap"]
|
max_v_gap = bbox_search["max_v_gap"]
|
||||||
|
|
@ -891,7 +973,26 @@ class Hybrid(BaseParser):
|
||||||
|
|
||||||
# FRHTODO: get debug_info to work again
|
# FRHTODO: get debug_info to work again
|
||||||
def _generate_table_bbox(self, debug_info=None):
|
def _generate_table_bbox(self, debug_info=None):
|
||||||
textlines = self.horizontal_text + self.vertical_text
|
if self.table_areas is not None:
|
||||||
|
table_bbox = {}
|
||||||
|
for area_str in self.table_areas:
|
||||||
|
table_bbox[bbox_from_str(area_str)] = None
|
||||||
|
self.table_bbox = table_bbox
|
||||||
|
return
|
||||||
|
|
||||||
|
all_textlines = self.horizontal_text + self.vertical_text
|
||||||
|
textlines = []
|
||||||
|
if self.table_regions is None:
|
||||||
|
textlines = all_textlines
|
||||||
|
else:
|
||||||
|
# filter text
|
||||||
|
for region_str in self.table_regions:
|
||||||
|
region_text = text_in_bbox(
|
||||||
|
bbox_from_str(region_str),
|
||||||
|
all_textlines
|
||||||
|
)
|
||||||
|
textlines.extend(region_text)
|
||||||
|
|
||||||
textlines_processed = {}
|
textlines_processed = {}
|
||||||
self.table_bbox = {}
|
self.table_bbox = {}
|
||||||
if debug_info is not None:
|
if debug_info is not None:
|
||||||
|
|
@ -1053,7 +1154,7 @@ class Hybrid(BaseParser):
|
||||||
|
|
||||||
# Identify plausible areas within the doc where tables lie,
|
# Identify plausible areas within the doc where tables lie,
|
||||||
# populate table_bbox keys with these areas.
|
# populate table_bbox keys with these areas.
|
||||||
self._generate_table_bbox()
|
self._generate_table_bbox(debug_info)
|
||||||
|
|
||||||
_tables = []
|
_tables = []
|
||||||
# sort tables based on y-coord
|
# sort tables based on y-coord
|
||||||
|
|
|
||||||
|
|
@ -7,7 +7,7 @@ import numpy as np
|
||||||
|
|
||||||
from .base import BaseParser
|
from .base import BaseParser
|
||||||
from ..core import TextEdges
|
from ..core import TextEdges
|
||||||
from ..utils import (text_in_bbox, text_in_bbox_per_axis)
|
from ..utils import (bbox_from_str, text_in_bbox, text_in_bbox_per_axis)
|
||||||
|
|
||||||
|
|
||||||
class Stream(BaseParser):
|
class Stream(BaseParser):
|
||||||
|
|
@ -307,26 +307,17 @@ class Stream(BaseParser):
|
||||||
if self.table_regions is not None:
|
if self.table_regions is not None:
|
||||||
# filter horizontal text
|
# filter horizontal text
|
||||||
hor_text = []
|
hor_text = []
|
||||||
for region in self.table_regions:
|
for region_str in self.table_regions:
|
||||||
x1, y1, x2, y2 = region.split(",")
|
|
||||||
x1 = float(x1)
|
|
||||||
y1 = float(y1)
|
|
||||||
x2 = float(x2)
|
|
||||||
y2 = float(y2)
|
|
||||||
region_text = text_in_bbox(
|
region_text = text_in_bbox(
|
||||||
(x1, y2, x2, y1), self.horizontal_text)
|
bbox_from_str(region_str),
|
||||||
|
self.horizontal_text)
|
||||||
hor_text.extend(region_text)
|
hor_text.extend(region_text)
|
||||||
# find tables based on nurminen's detection algorithm
|
# find tables based on nurminen's detection algorithm
|
||||||
table_bbox = self._nurminen_table_detection(hor_text)
|
table_bbox = self._nurminen_table_detection(hor_text)
|
||||||
else:
|
else:
|
||||||
table_bbox = {}
|
table_bbox = {}
|
||||||
for area in self.table_areas:
|
for area_str in self.table_areas:
|
||||||
x1, y1, x2, y2 = area.split(",")
|
table_bbox[bbox_from_str(area_str)] = None
|
||||||
x1 = float(x1)
|
|
||||||
y1 = float(y1)
|
|
||||||
x2 = float(x2)
|
|
||||||
y2 = float(y2)
|
|
||||||
table_bbox[(x1, y2, x2, y1)] = None
|
|
||||||
self.table_bbox = table_bbox
|
self.table_bbox = table_bbox
|
||||||
|
|
||||||
def _generate_columns_and_rows(self, table_idx, tk):
|
def _generate_columns_and_rows(self, table_idx, tk):
|
||||||
|
|
|
||||||
|
|
@ -389,6 +389,34 @@ def segments_in_bbox(bbox, v_segments, h_segments):
|
||||||
return v_s, h_s
|
return v_s, h_s
|
||||||
|
|
||||||
|
|
||||||
|
def bbox_from_str(bbox_str):
    """Parse a bbox serialized as "x1,y1,x2,y2" into a normalized tuple.

    Parameters
    ----------
    bbox_str : str
        Four comma separated coordinates, "x1,y1,x2,y2".

    Returns
    -------
    bbox : tuple
        Tuple of floats ordered as
        (min(x1, x2), min(y1, y2), max(x1, x2), max(y1, y2)).

    """
    # Unpacking before converting enforces exactly four fields.
    raw_x1, raw_y1, raw_x2, raw_y2 = bbox_str.split(",")
    x1, y1, x2, y2 = (
        float(raw) for raw in (raw_x1, raw_y1, raw_x2, raw_y2)
    )
    # FRHTODO: do things still work if I do x1, y1, x2, y2?
    low_corner = (min(x1, x2), min(y1, y2))
    high_corner = (max(x1, x2), max(y1, y2))
    return low_corner + high_corner
|
||||||
|
|
||||||
|
|
||||||
def text_in_bbox(bbox, text):
|
def text_in_bbox(bbox, text):
|
||||||
"""Returns all text objects present inside a bounding box.
|
"""Returns all text objects present inside a bounding box.
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1297,6 +1297,10 @@ data_stream_two_tables_1 = [
|
||||||
],
|
],
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# The streaming algorithm incorrectly includes a header and a footer.
|
||||||
|
# Trim them off for the hybrid test, whose output doesn't include them.
|
||||||
|
data_hybrid_two_tables_1 = data_stream_two_tables_1[3:-1]
|
||||||
|
|
||||||
data_stream_two_tables_2 = [
|
data_stream_two_tables_2 = [
|
||||||
["Table 325. Arrests by Race: 2009", "", "", "", "", ""],
|
["Table 325. Arrests by Race: 2009", "", "", "", "", ""],
|
||||||
[
|
[
|
||||||
|
|
@ -1605,6 +1609,10 @@ data_stream_two_tables_2 = [
|
||||||
["1 Except forcible rape and prostitution.", "", "", "", "", ""],
|
["1 Except forcible rape and prostitution.", "", "", "", "", ""],
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# The streaming algorithm incorrectly includes a header and a footer.
|
||||||
|
# Trim them off for the hybrid test, whose output doesn't include them.
|
||||||
|
data_hybrid_two_tables_2 = data_stream_two_tables_2[3:-1]
|
||||||
|
|
||||||
data_stream_table_areas = [
|
data_stream_table_areas = [
|
||||||
["", "One Withholding"],
|
["", "One Withholding"],
|
||||||
["Payroll Period", "Allowance"],
|
["Payroll Period", "Allowance"],
|
||||||
|
|
|
||||||
|
|
@ -175,8 +175,8 @@ def test_hybrid_table_rotated():
|
||||||
|
|
||||||
|
|
||||||
def test_hybrid_two_tables():
|
def test_hybrid_two_tables():
|
||||||
df1 = pd.DataFrame(data_stream_two_tables_1)
|
df1 = pd.DataFrame(data_hybrid_two_tables_1)
|
||||||
df2 = pd.DataFrame(data_stream_two_tables_2)
|
df2 = pd.DataFrame(data_hybrid_two_tables_2)
|
||||||
|
|
||||||
filename = os.path.join(testdir, "tabula/12s0324.pdf")
|
filename = os.path.join(testdir, "tabula/12s0324.pdf")
|
||||||
tables = camelot.read_pdf(filename, flavor="hybrid")
|
tables = camelot.read_pdf(filename, flavor="hybrid")
|
||||||
|
|
@ -190,8 +190,10 @@ def test_hybrid_table_regions():
|
||||||
df = pd.DataFrame(data_stream_table_areas)
|
df = pd.DataFrame(data_stream_table_areas)
|
||||||
|
|
||||||
filename = os.path.join(testdir, "tabula/us-007.pdf")
|
filename = os.path.join(testdir, "tabula/us-007.pdf")
|
||||||
|
# The "stream" test looks for a region in ["320,460,573,335"], which
|
||||||
|
# should exclude the header.
|
||||||
tables = camelot.read_pdf(
|
tables = camelot.read_pdf(
|
||||||
filename, flavor="hybrid", table_regions=["320,460,573,335"]
|
filename, flavor="hybrid", table_regions=["320,505,573,330"]
|
||||||
)
|
)
|
||||||
assert_frame_equal(df, tables[0].df)
|
assert_frame_equal(df, tables[0].df)
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue