Improve column detection for hybrid flavor

No longer rely on the mode of per-row element counts; instead, reuse the parsing analysis done during network detection. Adds a unit test for a complex table with a vertical header and mixed horizontal / vertical text.
@@ -6,7 +6,6 @@ from __future__ import division
 import copy
 import math
 import numpy as np
-import warnings

 from .base import TextBaseParser
 from ..core import (
@@ -18,6 +17,7 @@ from ..core import (
 from ..utils import (
     bbox_from_str,
     text_in_bbox,
+    textlines_overlapping_bbox,
     bbox_from_textlines,
     find_columns_coordinates,
     text_in_bbox_per_axis,
@@ -321,11 +321,17 @@ class TextNetworks(TextAlignments):
         horizontal axis.

         """
-        # Find the textline with the highest alignment score
+        # Find the textline with the highest alignment score, with a tie break
+        # to prefer textlines further down in the table. Starting the search
+        # from the table's bottom allows the algo to collect data on more cells
+        # before going to the header, typically harder to parse.
         return max(
             self._textline_to_alignments.keys(),
             key=lambda textline:
-                self._textline_to_alignments[textline].alignment_score(),
+                (
+                    self._textline_to_alignments[textline].alignment_score(),
+                    -textline.y0
+                ),
             default=None
         )
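A quick illustration of the tie break above: max() compares key tuples element by element, so between textlines with equal alignment scores the one with the larger -y0 wins, which is the textline lower on the page. A minimal sketch with a hypothetical stub, not the real PDFMiner textline class:

    from collections import namedtuple

    # Hypothetical stand-in for a PDFMiner textline; only y0 matters here.
    TextLine = namedtuple("TextLine", ["name", "score", "y0"])

    candidates = [
        TextLine("header", score=5, y0=700.0),  # top of the page
        TextLine("body", score=5, y0=120.0),    # same score, lower down
    ]

    # Equal scores, so the second tuple element decides: -120.0 > -700.0,
    # hence the bottom textline wins, mirroring the parser's preference.
    best = max(candidates, key=lambda tl: (tl.score, -tl.y0), default=None)
    assert best.name == "body"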
@@ -566,12 +572,13 @@ class Hybrid(TextBaseParser):
         )

     def _generate_table_bbox(self):
+        user_provided_bboxes = None
         if self.table_areas is not None:
-            table_bbox = {}
+            # User gave us table areas already. We will use their coordinates
+            # to find column anchors.
+            user_provided_bboxes = []
             for area_str in self.table_areas:
-                table_bbox[bbox_from_str(area_str)] = None
-            self.table_bbox = table_bbox
-            return
+                user_provided_bboxes.append(bbox_from_str(area_str))

         # Take all the textlines that are not just spaces
         all_textlines = [
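For context, the code path above is what a caller exercises by passing explicit areas; a hedged usage sketch (file name and coordinates are made up; each table_areas string is "x1,y1,x2,y2" in PDF coordinate space, parsed by bbox_from_str):

    import camelot

    # Hypothetical document and area; with table_areas given, the hybrid
    # parser now skips network detection for the bbox and only computes
    # column anchors inside the user-supplied rectangle.
    tables = camelot.read_pdf(
        "election_results.pdf",
        flavor="hybrid",
        table_areas=["316,499,566,337"],
    )
    print(tables[0].df.head())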
@@ -593,59 +600,73 @@ class Hybrid(TextBaseParser):
         parse_details_bbox_searches = None

         while True:
-            text_network = TextNetworks()
-            text_network.generate(textlines)
-            text_network._remove_unconnected_edges()
-            gaps_hv = text_network._compute_plausible_gaps()
-            if gaps_hv is None:
-                return None
-            # edge_tol instructions override the calculated vertical gap
-            edge_tol_hv = (
-                gaps_hv[0],
-                gaps_hv[1] if self.edge_tol is None else self.edge_tol
-            )
-            bbox = text_network._build_bbox_candidate(
-                edge_tol_hv,
-                parse_details=parse_details_bbox_searches
-            )
-            if bbox is None:
-                break
-
-            if parse_details_network_searches is not None:
-                # Preserve the current edge calculation for display debugging
-                parse_details_network_searches.append(
-                    copy.deepcopy(text_network)
-                )
-
-            # Get all the textlines that are at least 50% in the box
-            tls_in_bbox = text_in_bbox(bbox, textlines)
-
-            # and expand the text box to fully contain them
-            bbox = bbox_from_textlines(tls_in_bbox)
-
-            # FRH: do we need to repeat this?
-            # tls_in_bbox = text_in_bbox(bbox, textlines)
+            # Find a bbox: either pulling from the user's or from the network
+            # algorithm.
+
+            # First look for the body of the table
+            bbox_body = None
+            if user_provided_bboxes is not None:
+                if len(user_provided_bboxes) > 0:
+                    bbox_body = user_provided_bboxes.pop()
+            else:
+                text_network = TextNetworks()
+                text_network.generate(textlines)
+                text_network._remove_unconnected_edges()
+                gaps_hv = text_network._compute_plausible_gaps()
+                if gaps_hv is None:
+                    return None
+                # edge_tol instructions override the calculated vertical gap
+                edge_tol_hv = (
+                    gaps_hv[0],
+                    gaps_hv[1] if self.edge_tol is None else self.edge_tol
+                )
+                bbox_body = text_network._build_bbox_candidate(
+                    edge_tol_hv,
+                    parse_details=parse_details_bbox_searches
+                )
+
+                if parse_details_network_searches is not None:
+                    # Preserve the current edge calculation for debugging
+                    parse_details_network_searches.append(
+                        copy.deepcopy(text_network)
+                    )
+
+            if bbox_body is None:
+                break
+
+            # Get all the textlines that overlap with the box, compute
+            # columns
+            tls_in_bbox = textlines_overlapping_bbox(bbox_body, textlines)
             cols_anchors = find_columns_coordinates(tls_in_bbox)

-            # Apply a heuristic to salvage headers which formatting might be
-            # off compared to the rest of the table.
-            expanded_bbox = search_header_from_body_bbox(
-                bbox,
-                textlines,
-                cols_anchors,
-                gaps_hv[1]
-            )
+            # Unless the user gave us strict bbox_body, try to find a header
+            # above the body to build the full bbox.
+            if user_provided_bboxes is not None:
+                bbox_full = bbox_body
+            else:
+                # Expand the text box to fully contain the tls we found
+                bbox_body = bbox_from_textlines(tls_in_bbox)
+
+                # Apply a heuristic to salvage headers which formatting might
+                # be off compared to the rest of the table.
+                bbox_full = search_header_from_body_bbox(
+                    bbox_body,
+                    textlines,
+                    cols_anchors,
+                    gaps_hv[1]
+                )
+
+            table_parse = {
+                "bbox_body": bbox_body,
+                "cols_anchors": cols_anchors,
+                "bbox_full": bbox_full
+            }
+            self.table_bbox[bbox_full] = table_parse

             if self.parse_details is not None:
                 if "col_searches" not in self.parse_details:
                     self.parse_details["col_searches"] = []
-                self.parse_details["col_searches"].append({
-                    "core_bbox": bbox,
-                    "cols_anchors": cols_anchors,
-                    "expanded_bbox": expanded_bbox
-                })
-
-            self.table_bbox[expanded_bbox] = None
+                self.parse_details["col_searches"].append(table_parse)

             # Remember what textlines we processed, and repeat
             for tl in tls_in_bbox:
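Note that the same table_parse dict now serves both as the value stored in self.table_bbox and as the debugging record in parse_details["col_searches"]. A sketch of reading those records back, assuming a parse_details dict was threaded into the parser beforehand (a debugging hook, not a public API):

    # Each entry mirrors the table_parse dict built in the loop above.
    for search in parse_details.get("col_searches", []):
        left, bottom, right, top = search["bbox_full"]
        print("table found at", (left, bottom, right, top))
        print("column anchors:", search["cols_anchors"])
        print("body-only bbox:", search["bbox_body"])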
@@ -682,7 +703,6 @@ class Hybrid(TextBaseParser):
             # the alignment identification work we've done earlier.
             rows_grouped = self._group_rows(all_tls, row_tol=self.row_tol)
             rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
-            elements = [len(r) for r in rows_grouped]

             if self.columns is not None and self.columns[table_idx] != "":
                 # user has to input boundary columns too
@@ -695,53 +715,11 @@ class Hybrid(TextBaseParser):
             cols.append(text_x_max)
             cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
         else:
-            # calculate mode of the list of number of elements in
-            # each row to guess the number of columns
-            ncols = max(set(elements), key=elements.count)
-            if ncols == 1:
-                # if mode is 1, the page usually contains not tables
-                # but there can be cases where the list can be skewed,
-                # try to remove all 1s from list in this case and
-                # see if the list contains elements, if yes, then use
-                # the mode after removing 1s
-                elements = list(filter(lambda x: x != 1, elements))
-                if elements:
-                    ncols = max(set(elements), key=elements.count)
-                else:
-                    warnings.warn(
-                        "No tables found in table area {}"
-                        .format(table_idx + 1)
-                    )
-            cols = [
-                (t.x0, t.x1)
-                for r in rows_grouped
-                if len(r) == ncols
-                for t in r
-            ]
-            cols = self._merge_columns(
-                sorted(cols),
-                column_tol=self.column_tol
-            )
-            inner_text = []
-            for i in range(1, len(cols)):
-                left = cols[i - 1][1]
-                right = cols[i][0]
-                inner_text.extend(
-                    [
-                        t
-                        for direction in self.t_bbox
-                        for t in self.t_bbox[direction]
-                        if t.x0 > left and t.x1 < right
-                    ]
-                )
-            outer_text = [
-                t
-                for direction in self.t_bbox
-                for t in self.t_bbox[direction]
-                if t.x0 > cols[-1][1] or t.x1 < cols[0][0]
-            ]
-            inner_text.extend(outer_text)
-            cols = self._add_columns(cols, inner_text, self.row_tol)
-            cols = self._join_columns(cols, text_x_min, text_x_max)
+            parse_details = self.table_bbox[bbox]
+            col_anchors = parse_details["cols_anchors"]
+            cols = list(map(
+                lambda idx: [col_anchors[idx], col_anchors[idx + 1]],
+                range(0, len(col_anchors) - 1)
+            ))

         return cols, rows, None, None
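To see why this replacement helps: the deleted branch guessed the column count from the mode of per-row element counts, which sparse rows and a vertical header skew badly; the new branch simply pairs consecutive column anchors. A small worked example with made-up anchor values:

    # Column anchors as produced by find_columns_coordinates():
    # x-coordinates separating the table's columns.
    col_anchors = [50.0, 120.0, 200.0, 310.0]

    # Pair consecutive anchors into (left, right) column spans, exactly
    # as the new else-branch does.
    cols = [
        [col_anchors[idx], col_anchors[idx + 1]]
        for idx in range(len(col_anchors) - 1)
    ]
    print(cols)  # [[50.0, 120.0], [120.0, 200.0], [200.0, 310.0]]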
@@ -472,7 +472,7 @@ class PlotMethods():

         for box_id, col_search in enumerate(parse_details["col_searches"]):
             draw_labeled_bbox(
-                ax, col_search["expanded_bbox"],
+                ax, col_search["bbox_full"],
                 "box body + header #{box_id}".format(
                     box_id=box_id
                 ),
@@ -481,7 +481,7 @@ class PlotMethods():
                 label_pos="top,left"
             )
             draw_labeled_bbox(
-                ax, col_search["core_bbox"],
+                ax, col_search["bbox_body"],
                 "box body #{box_id}".format(
                     box_id=box_id
                 ),
@@ -495,8 +495,8 @@ class PlotMethods():
             ax.plot(
                 [col_anchor, col_anchor],
                 [
-                    col_search["core_bbox"][1] - 10,
-                    col_search["core_bbox"][3] + 10,
+                    col_search["bbox_body"][1] - 10,
+                    col_search["bbox_body"][3] + 10,
                 ],
                 color="green"
             )
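For anyone reproducing this debug view outside camelot, a self-contained matplotlib sketch of what these calls render; draw_labeled_bbox is approximated with a Rectangle plus an annotation since the helper's implementation is not part of this diff, and the parse values are made up:

    import matplotlib.pyplot as plt
    from matplotlib.patches import Rectangle

    # Made-up parse results in the shape recorded by the hybrid parser.
    col_search = {
        "bbox_body": (50, 100, 310, 500),
        "bbox_full": (50, 100, 310, 540),
        "cols_anchors": [50, 120, 200, 310],
    }

    fig, ax = plt.subplots()
    for key, color in (("bbox_full", "blue"), ("bbox_body", "red")):
        x1, y1, x2, y2 = col_search[key]
        ax.add_patch(Rectangle((x1, y1), x2 - x1, y2 - y1,
                               fill=False, edgecolor=color))
        ax.annotate(key, (x1, y2), color=color)

    # Vertical green lines through each column anchor, slightly
    # overshooting the body bbox, as in the updated PlotMethods code.
    for col_anchor in col_search["cols_anchors"]:
        ax.plot([col_anchor, col_anchor],
                [col_search["bbox_body"][1] - 10,
                 col_search["bbox_body"][3] + 10],
                color="green")

    ax.set_xlim(0, 400)
    ax.set_ylim(0, 600)
    plt.show()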
@@ -431,8 +431,36 @@ def bbox_from_str(bbox_str):
     )


+def textlines_overlapping_bbox(bbox, textlines):
+    """Returns all text objects which overlap or are within a bounding box.
+
+    Parameters
+    ----------
+    bbox : tuple
+        Tuple (x1, y1, x2, y2) representing a bounding box where
+        (x1, y1) -> lb and (x2, y2) -> rt in the PDF coordinate
+        space.
+    textlines : List of PDFMiner text objects.
+
+    Returns
+    -------
+    t_bbox : list
+        List of PDFMiner text objects.
+
+    """
+    (left, bottom, right, top) = bbox
+    t_bbox = [
+        t
+        for t in textlines
+        if ((left < t.x0 < right) or (left < t.x1 < right))
+        and ((bottom < t.y0 < top) or (bottom < t.y1 < top))
+    ]
+    return t_bbox
+
+
 def text_in_bbox(bbox, text):
-    """Returns all text objects which lie at least 50% inside a bounding box.
+    """Returns all text objects which lie at least 50% inside a bounding box
+    across both dimensions.

     Parameters
     ----------
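The practical difference from text_in_bbox: a textline that merely pokes into the box, such as a long vertical header, is now collected. A minimal sketch with a hypothetical stub carrying the same x0/y0/x1/y1 attributes as the PDFMiner objects:

    from collections import namedtuple

    # Stub with the coordinate attributes the real PDFMiner objects carry.
    TL = namedtuple("TL", ["x0", "y0", "x1", "y1"])

    bbox = (100, 100, 300, 300)
    inside = TL(120, 150, 180, 165)    # fully inside the bbox
    poking_in = TL(90, 290, 110, 460)  # vertical header, mostly above/left

    def overlaps(bbox, t):
        # Same test as textlines_overlapping_bbox(): a corner coordinate
        # falling strictly inside the box on both axes counts as overlap.
        left, bottom, right, top = bbox
        return (((left < t.x0 < right) or (left < t.x1 < right))
                and ((bottom < t.y0 < top) or (bottom < t.y1 < top)))

    assert overlaps(bbox, inside)
    assert overlaps(bbox, poking_in)  # would fail the 50%-inside test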
tests/data.py (447 additions)

@@ -1629,6 +1629,453 @@ data_hybrid_two_tables_b_2 = [
 # Trimming the table for the test of hybrid, which doesn't include it.
 data_hybrid_two_tables_2 = data_stream_two_tables_2[3:-1]

+data_hybrid_vertical_headers = [
+    ["", "", "", "", "", "", "", "", "", "", "", "Congress-", "Senator 36th",
+     "Rep106th", "", "Reg. of", "Road", "", "", "", "Distri", "Dist", "", ""],
+    ["", "", "", "", "", "", "", "", "", "", "", "1st Dist", "Dist.", "Dist.",
+     "", "Deeds", "", "Commission", "", "District #1", "ct #2", "#3", "",
+     "Dist #4"],
+    ["", "", "", "", "", "Governor", "", "", "U.S. Senator", "", "", "", "",
+     "", "", "", "", "", "", "", "", "", "", ""],
+    ["", "Number of Registered voters", "Poll Book Totals", "Brian Calley",
+     "Patrick Colbeck", "Jim Hines", "Bill Schuette", "John James",
+     "Sandy Pensler", "", "Jack Bergman", "", "Jim Stamas", "Sue Allor",
+     "Melissa A. Cordes", "", "Al Scully", "", "Daniel G. Gauthier",
+     "Craig M. Clemens", "Craig Johnston", "Carolyn Brummund", "Adam Brege",
+     "David Bielusiak"],
+    ["Alcona", "963", "439", "55", "26", "47", "164", "173", "111", "",
+     "268", "", "272", "275", "269", "", "271", "", "224", "76", "", "", "",
+     ""],
+    ["Caledonia", "923", "393", "40", "23", "45", "158", "150", "103", "",
+     "244", "", "247", "254", "255", "", "244", "", "139", "143", "", "", "",
+     ""],
+    ["Curtis", "1026", "349", "30", "30", "25", "102", "95", "84", "", "159",
+     "", "164", "162", "161", "", "157", "", "", "", "", "", "", ""],
+    ["Greenbush", "1212", "423", "56", "26", "40", "126", "104", "131", "",
+     "208", "", "213", "214", "215", "", "208", "", "", "", "", "208", "",
+     ""],
+    ["Gustin", "611", "180", "22", "35", "17", "55", "73", "45", "", "108",
+     "", "104", "111", "111", "", "109", "", "", "", "", "", "81", "42"],
+    ["Harrisville", "1142", "430", "45", "90", "29", "101", "155", "94", "",
+     "226", "", "226", "232", "244", "", "226", "", "", "", "232", "", "",
+     ""],
+    ["Hawes", "884", "293", "38", "36", "27", "109", "121", "84", "", "192",
+     "", "195", "195", "193", "", "184", "", "", "", "", "", "118", "87"],
+    ["Haynes", "626", "275", "31", "20", "32", "104", "121", "53", "", "163",
+     "", "163", "173", "161", "", "152", "", "", "", "76", "", "69", "31"],
+    ["Mikado", "781", "208", "19", "39", "17", "81", "90", "63", "", "149",
+     "", "149", "145", "147", "", "143", "", "", "", "", "113", "", ""],
+    ["Millen", "353", "139", "7", "16", "13", "38", "49", "19", "", "62",
+     "", "66", "67", "66", "", "62", "", "", "", "", "", "", ""],
+    ["Mitchell", "327", "96", "12", "17", "7", "29", "41", "17", "", "57",
+     "", "55", "57", "60", "", "56", "", "", "", "", "", "", ""],
+    ["City Harrisville", "389", "171", "16", "15", "18", "35", "49", "31",
+     "", "78", "", "80", "82", "81", "", "77", "", "", "", "73", "", "", ""],
+    ["Totals", "9237", "3396", "371", "373", "317", "1102", "1221", "835",
+     "0", "1914", "0", "1934", "1967", "1963", "0", "1889", "0", "363",
+     "219", "381", "321", "268", "160"],
+]
+
+
 data_stream_table_areas = [
     ["", "One Withholding"],
     ["Payroll Period", "Allowance"],
(Four image fixtures also appear in the diff, with sizes unchanged: 103 KiB, 48 KiB, 90 KiB, 101 KiB.)
@@ -194,6 +194,17 @@ def test_hybrid_two_tables_b():
     assert df2.equals(tables[1].df)


+def test_hybrid_vertical_header():
+    """Tests a complex table with a vertical text header.
+    """
+    df = pd.DataFrame(data_hybrid_vertical_headers)
+
+    filename = os.path.join(testdir, "vertical_header.pdf")
+    tables = camelot.read_pdf(filename, flavor="hybrid")
+    assert len(tables) == 1
+    assert_frame_equal(df, tables[0].df)
+
+
 def test_hybrid_table_regions():
     df = pd.DataFrame(data_hybrid_table_regions)