WIP: Introduce actual hybrid parser

Create a hybrid parser that leverages both lattice and network techniques.
Simplify plotting of the PDF in the lattice case.
Rename "parser.table_bbox" to "parser.table_bbox_parses", since it is
not a bbox but a dict mapping each bbox to its corresponding parsing data.

Still missing: more unit tests, plotting of steps.
pull/153/head
Frh 2020-05-04 16:27:01 -07:00
parent 6711f877bf
commit 77d289bd86
17 changed files with 1011 additions and 217 deletions

View File

@ -396,7 +396,8 @@ def network(c, *args, **kwargs):
"Please specify output file format using --format") "Please specify output file format using --format")
tables = read_pdf( tables = read_pdf(
filepath, pages=pages, flavor="network", suppress_stdout=quiet, **kwargs filepath, pages=pages, flavor="network",
suppress_stdout=quiet, **kwargs
) )
click.echo("Found {} tables".format(tables.n)) click.echo("Found {} tables".format(tables.n))
if plot_type is not None: if plot_type is not None:

View File

@ -454,7 +454,9 @@ class Table():
self.page = None self.page = None
self.flavor = None # Flavor of the parser used self.flavor = None # Flavor of the parser used
self.pdf_size = None # Dimensions of the original PDF page self.pdf_size = None # Dimensions of the original PDF page
self.parse_details = None # Field holding debug data self._bbox = None # Bounding box in original document
self.parse = None # Parse information
self.parse_details = None # Field holding extra debug data
self._image = None self._image = None
self._image_path = None # Temporary file to hold an image of the pdf self._image_path = None # Temporary file to hold an image of the pdf

View File

@ -7,7 +7,7 @@ import logging
from PyPDF2 import PdfFileReader, PdfFileWriter from PyPDF2 import PdfFileReader, PdfFileWriter
from .core import TableList from .core import TableList
from .parsers import Stream, Lattice, Network from .parsers import Stream, Lattice, Network, Hybrid
from .utils import ( from .utils import (
build_file_path_in_temp_dir, build_file_path_in_temp_dir,
get_page_layout, get_page_layout,
@ -23,6 +23,7 @@ PARSERS = {
"lattice": Lattice, "lattice": Lattice,
"stream": Stream, "stream": Stream,
"network": Network, "network": Network,
"hybrid": Hybrid,
} }
@ -177,7 +178,8 @@ class PDFHandler():
Parameters Parameters
---------- ----------
flavor : str (default: 'lattice') flavor : str (default: 'lattice')
The parsing method to use ('lattice', 'stream', or 'network'). The parsing method to use ('lattice', 'stream', 'network',
or 'hybrid').
Lattice is used by default. Lattice is used by default.
suppress_stdout : str (default: False) suppress_stdout : str (default: False)
Suppress logs and warnings. Suppress logs and warnings.

View File

@ -6,7 +6,9 @@ import cv2
import numpy as np import numpy as np
def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2): def adaptive_threshold(
imagename, process_background=False,
blocksize=15, c=-2):
"""Thresholds an image using OpenCV's adaptiveThreshold. """Thresholds an image using OpenCV's adaptiveThreshold.
Parameters Parameters
@ -19,12 +21,12 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
Size of a pixel neighborhood that is used to calculate a Size of a pixel neighborhood that is used to calculate a
threshold value for the pixel: 3, 5, 7, and so on. threshold value for the pixel: 3, 5, 7, and so on.
For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_. For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_. # noqa
c : int, optional (default: -2) c : int, optional (default: -2)
Constant subtracted from the mean or weighted mean. Constant subtracted from the mean or weighted mean.
Normally, it is positive but may be zero or negative as well. Normally, it is positive but may be zero or negative as well.
For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_. For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_. # noqa
Returns Returns
------- -------
@ -39,7 +41,10 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
if process_background: if process_background:
threshold = cv2.adaptiveThreshold( threshold = cv2.adaptiveThreshold(
gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, blocksize, c gray,
255,
cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
cv2.THRESH_BINARY, blocksize, c
) )
else: else:
threshold = cv2.adaptiveThreshold( threshold = cv2.adaptiveThreshold(
@ -54,7 +59,8 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
def find_lines( def find_lines(
threshold, regions=None, direction="horizontal", line_scale=15, iterations=0 threshold, regions=None,
direction="horizontal", line_scale=15, iterations=0
): ):
"""Finds horizontal and vertical lines by applying morphological """Finds horizontal and vertical lines by applying morphological
transformations on an image. transformations on an image.
@ -78,7 +84,7 @@ def find_lines(
iterations : int, optional (default: 0) iterations : int, optional (default: 0)
Number of times for erosion/dilation is applied. Number of times for erosion/dilation is applied.
For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_. For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_. # noqa
Returns Returns
------- -------
@ -100,13 +106,15 @@ def find_lines(
size = threshold.shape[1] // line_scale size = threshold.shape[1] // line_scale
el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1)) el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1))
elif direction is None: elif direction is None:
raise ValueError("Specify direction as either 'vertical' or 'horizontal'") raise ValueError(
"Specify direction as either 'vertical' or 'horizontal'"
)
if regions is not None: if regions is not None:
region_mask = np.zeros(threshold.shape) region_mask = np.zeros(threshold.shape)
for region in regions: for region in regions:
x, y, w, h = region x, y, w, h = region
region_mask[y : y + h, x : x + w] = 1 region_mask[y:y + h, x:x + w] = 1
threshold = np.multiply(threshold, region_mask) threshold = np.multiply(threshold, region_mask)
threshold = cv2.erode(threshold, el) threshold = cv2.erode(threshold, el)
@ -115,12 +123,14 @@ def find_lines(
try: try:
_, contours, _ = cv2.findContours( _, contours, _ = cv2.findContours(
threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE threshold.astype(np.uint8), cv2.RETR_EXTERNAL,
cv2.CHAIN_APPROX_SIMPLE
) )
except ValueError: except ValueError:
# for opencv backward compatibility # for opencv backward compatibility
contours, _ = cv2.findContours( contours, _ = cv2.findContours(
threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE threshold.astype(np.uint8), cv2.RETR_EXTERNAL,
cv2.CHAIN_APPROX_SIMPLE
) )
for c in contours: for c in contours:
@ -202,7 +212,7 @@ def find_joints(contours, vertical, horizontal):
tables = {} tables = {}
for c in contours: for c in contours:
x, y, w, h = c x, y, w, h = c
roi = joints[y : y + h, x : x + w] roi = joints[y:y + h, x:x + w]
try: try:
__, jc, __ = cv2.findContours( __, jc, __ = cv2.findContours(
roi.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE roi.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE

View File

@ -99,7 +99,7 @@ def read_pdf(
""" """
layout_kwargs = layout_kwargs or {} layout_kwargs = layout_kwargs or {}
if flavor not in ["lattice", "stream", "network"]: if flavor not in ["lattice", "stream", "network", "hybrid"]:
raise NotImplementedError( raise NotImplementedError(
"Unknown flavor specified." "Unknown flavor specified."
" Use either 'lattice', 'stream', or 'network'" " Use either 'lattice', 'stream', or 'network'"

View File

@ -3,3 +3,4 @@
from .stream import Stream from .stream import Stream
from .lattice import Lattice from .lattice import Lattice
from .network import Network from .network import Network
from .hybrid import Hybrid

View File

@ -34,8 +34,9 @@ class BaseParser():
self.id = parser_id self.id = parser_id
self.table_regions = table_regions self.table_regions = table_regions
self.table_areas = table_areas self.table_areas = table_areas
self.table_bbox = {} self.table_bbox_parses = {}
self.columns = None
self.copy_text = copy_text self.copy_text = copy_text
self.split_text = split_text self.split_text = split_text
self.strip_text = strip_text self.strip_text = strip_text
@ -47,10 +48,18 @@ class BaseParser():
self.t_bbox = None self.t_bbox = None
# For plotting details of parsing algorithms # For plotting details of parsing algorithms
self.resolution = 300 # default plotting resolution of the PDF.
self.parse_details = {} self.parse_details = {}
if not debug: if not debug:
self.parse_details = None self.parse_details = None
def table_bboxes(self):
    """Return the known table bboxes, highest second coordinate first.

    The bboxes are the keys of ``self.table_bbox_parses``. They are
    ordered by decreasing ``bbox[1]``; bboxes sharing the same value keep
    their insertion order.

    Returns
    -------
    list
        The bbox keys, sorted as described above.
    """
    bboxes = list(self.table_bbox_parses)
    bboxes.sort(key=lambda bbox: bbox[1], reverse=True)
    return bboxes
def prepare_page_parse(self, filename, layout, dimensions, def prepare_page_parse(self, filename, layout, dimensions,
page_idx, layout_kwargs): page_idx, layout_kwargs):
self.filename = filename self.filename = filename
@ -142,6 +151,7 @@ class BaseParser():
table = Table(cols, rows) table = Table(cols, rows)
table.page = self.page table.page = self.page
table.order = table_idx + 1 table.order = table_idx + 1
table._bbox = self.table_bboxes()[table_idx]
return table return table
@staticmethod @staticmethod
@ -177,7 +187,7 @@ class BaseParser():
table.cells[r_idx][c_idx].text = text table.cells[r_idx][c_idx].text = text
return pos_errors return pos_errors
def _generate_columns_and_rows(self, bbox, table_idx): def _generate_columns_and_rows(self, bbox, user_cols):
# Pure virtual, must be defined by the derived parser # Pure virtual, must be defined by the derived parser
raise NotImplementedError() raise NotImplementedError()
@ -199,20 +209,23 @@ class BaseParser():
_tables = [] _tables = []
# sort tables based on y-coord # sort tables based on y-coord
for table_idx, bbox in enumerate( for table_idx, bbox in enumerate(self.table_bboxes()):
sorted( if self.columns is not None and self.columns[table_idx] != "":
self.table_bbox.keys(), # user has to input boundary columns too
key=lambda x: x[1], # take (0, pdf_width) by default
reverse=True # similar to else condition
) # len can't be 1
): user_cols = self.columns[table_idx].split(",")
user_cols = [float(c) for c in user_cols]
else:
user_cols = None
cols, rows, v_s, h_s = self._generate_columns_and_rows( cols, rows, v_s, h_s = self._generate_columns_and_rows(
bbox, bbox,
table_idx user_cols
) )
table = self._generate_table( table = self._generate_table(
table_idx, cols, rows, v_s=v_s, h_s=h_s) table_idx, cols, rows, v_s=v_s, h_s=h_s)
table._bbox = bbox
_tables.append(table) _tables.append(table)
return _tables return _tables
@ -222,6 +235,7 @@ class BaseParser():
""" """
table.flavor = self.id table.flavor = self.id
table.filename = self.filename table.filename = self.filename
table.parse = self.table_bbox_parses[table._bbox]
table.parse_details = self.parse_details table.parse_details = self.parse_details
pos_errors = self.compute_parse_errors(table) pos_errors = self.compute_parse_errors(table)
table.accuracy = compute_accuracy([[100, pos_errors]]) table.accuracy = compute_accuracy([[100, pos_errors]])
@ -453,17 +467,16 @@ class TextBaseParser(BaseParser):
raise ValueError("Length of table_areas and columns" raise ValueError("Length of table_areas and columns"
" should be equal") " should be equal")
def record_parse_metadata(self, table):
"""Record data about the origin of the table
"""
super().record_parse_metadata(table)
# for plotting
table._bbox = self.table_bbox
table._segments = None
def _generate_table(self, table_idx, cols, rows, **kwargs): def _generate_table(self, table_idx, cols, rows, **kwargs):
table = self._initialize_new_table(table_idx, cols, rows) table = self._initialize_new_table(table_idx, cols, rows)
table = table.set_all_edges() table = table.set_all_edges()
self.record_parse_metadata(table) self.record_parse_metadata(table)
return table return table
def record_parse_metadata(self, table):
    """Store the parse provenance on ``table``.

    Delegates to the parent implementation, then clears the plotting
    segments attribute of the table.

    Parameters
    ----------
    table : camelot.core.Table
    """
    super().record_parse_metadata(table)
    # Plotting data: this parser records no line segments on the table.
    table._segments = None

View File

@ -0,0 +1,221 @@
# -*- coding: utf-8 -*-
from ..utils import (
bboxes_overlap,
boundaries_to_split_lines,
)
from .base import BaseParser
from .network import Network
from .lattice import Lattice
class Hybrid(BaseParser):
    """Defines a hybrid parser, leveraging both network and lattice parsers.

    Both sub-parsers analyse the page. Where their table bboxes overlap,
    the network column boundaries are adjusted with the column splits
    found by lattice; bboxes found by only one parser are kept as is.
    Each resulting bbox is mapped, in ``table_bbox_parses``, to the
    sub-parser that will generate the table for it.

    Parameters
    ----------
    table_regions : list, optional (default: None)
        List of page regions that may contain tables of the form x1,y1,x2,y2
        where (x1, y1) -> left-top and (x2, y2) -> right-bottom
        in PDF coordinate space.
    table_areas : list, optional (default: None)
        List of table area strings of the form x1,y1,x2,y2
        where (x1, y1) -> left-top and (x2, y2) -> right-bottom
        in PDF coordinate space.
    columns : list, optional (default: None)
        List of column x-coordinates strings where the coordinates
        are comma-separated. Forwarded to the network sub-parser only.
    flag_size : bool, optional (default: False)
        Flag text based on font size. Useful to detect
        super/subscripts. Adds <s></s> around flagged text.
    split_text : bool, optional (default: False)
        Split text that spans across multiple cells.
    strip_text : str, optional (default: '')
        Characters that should be stripped from a string before
        assigning it to a cell.
    edge_tol : int, optional (default: None)
        Tolerance parameter for extending textedges vertically.
        Forwarded unchanged to both sub-parsers.
    row_tol : int, optional (default: 2)
        Tolerance parameter used to combine text vertically,
        to generate rows.
    column_tol : int, optional (default: 0)
        Tolerance parameter used to combine text horizontally,
        to generate columns.
    debug : bool, optional (default: False)
        Forwarded to the base parser and to both sub-parsers.
    **kwargs
        Accepted for signature compatibility; not used by this class.

    """

    def __init__(
            self,
            table_regions=None,
            table_areas=None,
            columns=None,
            flag_size=False,
            split_text=False,
            strip_text="",
            edge_tol=None,
            row_tol=2,
            column_tol=0,
            debug=False,
            **kwargs):
        super().__init__(
            "hybrid",
            table_regions=table_regions,
            table_areas=table_areas,
            flag_size=flag_size,
            split_text=split_text,
            strip_text=strip_text,
            debug=debug,
        )
        # NOTE(review): `columns` is only handed to the network sub-parser;
        # it is not stored on the hybrid parser itself. Confirm whether
        # user-supplied columns are meant to reach the hybrid dispatch.
        self.network_parser = Network(
            table_regions=table_regions,
            table_areas=table_areas,
            columns=columns,
            flag_size=flag_size,
            split_text=split_text,
            strip_text=strip_text,
            edge_tol=edge_tol,
            row_tol=row_tol,
            column_tol=column_tol,
            debug=debug,
        )
        self.lattice_parser = Lattice(
            table_regions=table_regions,
            table_areas=table_areas,
            flag_size=flag_size,
            split_text=split_text,
            strip_text=strip_text,
            edge_tol=edge_tol,
            row_tol=row_tol,
            column_tol=column_tol,
            debug=debug,
        )

    def prepare_page_parse(self, filename, layout, dimensions,
                           page_idx, layout_kwargs):
        """Prepare this parser and both sub-parsers for the same page."""
        super().prepare_page_parse(filename, layout, dimensions,
                                   page_idx, layout_kwargs)
        self.network_parser.prepare_page_parse(
            filename, layout, dimensions, page_idx, layout_kwargs)
        self.lattice_parser.prepare_page_parse(
            filename, layout, dimensions, page_idx, layout_kwargs)

    def _generate_columns_and_rows(self, bbox, user_cols):
        """Delegate to the sub-parser registered for ``bbox``.

        ``user_cols`` is passed through untouched to that sub-parser.
        """
        parser = self.table_bbox_parses[bbox]
        return parser._generate_columns_and_rows(bbox, user_cols)

    def _generate_table(self, table_idx, cols, rows, **kwargs):
        """Delegate table generation to the sub-parser owning the bbox.

        The bbox is looked up by position in this parser's own
        ``table_bboxes()`` ordering.
        """
        bbox = self.table_bboxes()[table_idx]
        parser = self.table_bbox_parses[bbox]
        # NOTE(review): `table_idx` indexes the hybrid bbox ordering; verify
        # the sub-parser does not use it to index its own bbox list.
        return parser._generate_table(table_idx, cols, rows, **kwargs)

    @staticmethod
    def _augment_boundaries_with_splits(boundaries, splits, tolerance=0):
        """Augment existing boundaries using provided hard splits.

        Boundaries: |---|     |-|  |--------|
        Splits:             |     |       |  |
        Augmented:  |-------|-----|-------|--|

        For example, boundaries ``[[0, 10], [20, 30]]`` with splits
        ``[15, 25]`` become ``[[0, 15], [15, 25], [25, 30]]``.

        Parameters
        ----------
        boundaries : list
            Mutable ``[left, right]`` pairs, ordered left to right.
            The list and its pairs are modified in place.
        splits : list
            Split coordinates, in increasing order.
        tolerance : number, optional (default: 0)
            Slack applied when comparing a split with a boundary edge.

        Returns
        -------
        list
            The same ``boundaries`` list, augmented.

        """
        if not boundaries:
            # Nothing to adjust: the columns are entirely defined by the
            # splits. Without this guard the loop below would fail on
            # boundaries[0] as soon as one split is provided.
            boundaries.extend(
                [left, right] for left, right in zip(splits, splits[1:])
            )
            return boundaries

        # Walk both sequences from right to left.
        idx_boundaries = len(boundaries) - 1
        idx_splits = len(splits) - 1
        previous_boundary = None  # boundary just right of the current one
        while idx_splits >= 0:
            split = splits[idx_splits]
            if idx_boundaries < 0:
                # All boundaries are to the right of the remaining splits:
                # prepend a boundary reaching the current leftmost one.
                boundaries.insert(0, [split, boundaries[0][0]])
                idx_splits -= 1
                continue
            boundary = boundaries[idx_boundaries]
            if boundary[1] < split + tolerance:
                # The split is to the right of this boundary: stretch the
                # boundary's right edge up to the split.
                boundary[1] = split
                # If there is a boundary further right, pull its left edge
                # onto the split as well so that no gap remains.
                if previous_boundary is not None:
                    previous_boundary[0] = split
                idx_splits -= 1
            elif boundary[0] > split - tolerance:
                # The boundary is entirely right of the split: move on to
                # the next boundary on the left.
                idx_boundaries -= 1
                previous_boundary = boundary
            else:
                # The split falls inside the boundary: cut it in two.
                # NOTE(review): a split exactly on a boundary edge also
                # lands here and yields a zero-width piece.
                new_boundary = [split, boundary[1]]
                boundaries.insert(idx_boundaries + 1, new_boundary)
                boundary[1] = split
                previous_boundary = new_boundary
                idx_splits -= 1
        return boundaries

    def _merge_bbox_analysis(self, lattice_bbox, network_bbox):
        """Merge the analyses of two overlapping bboxes.

        When network found column boundaries, they are augmented with the
        lattice column anchors and the result is registered, under a bbox
        widened to the augmented boundaries, with the network sub-parser.
        Otherwise the lattice bbox is registered with the lattice
        sub-parser.
        """
        lattice_parse = self.lattice_parser.table_bbox_parses[lattice_bbox]
        lattice_cols = lattice_parse["col_anchors"]

        network_bbox_data = self.network_parser.table_bbox_parses[network_bbox]
        network_cols_boundaries = network_bbox_data["cols_boundaries"]

        # Favor network, but complete or adjust its columns based on the
        # splits identified by lattice.
        if not network_cols_boundaries:
            # No (or empty) column boundaries from network: use lattice.
            self.table_bbox_parses[lattice_bbox] = self.lattice_parser
            return

        network_cols_boundaries = self._augment_boundaries_with_splits(
            network_cols_boundaries, lattice_cols)
        # The augmented columns may reach further left/right than the
        # original network bbox; the vertical extent is unchanged.
        augmented_bbox = (
            network_cols_boundaries[0][0], network_bbox[1],
            network_cols_boundaries[-1][1], network_bbox[3],
        )
        network_bbox_data["cols_anchors"] = \
            boundaries_to_split_lines(network_cols_boundaries)

        # Re-key the network parse data under the augmented bbox so the
        # network sub-parser can find it when generating the table.
        del self.network_parser.table_bbox_parses[network_bbox]
        self.network_parser.table_bbox_parses[augmented_bbox] = \
            network_bbox_data
        self.table_bbox_parses[augmented_bbox] = self.network_parser

    def _generate_table_bbox(self):
        """Detect table bboxes with both sub-parsers and merge the results.

        Populates ``self.table_bbox_parses`` with a mapping from bbox to
        the sub-parser responsible for it.
        """
        # Drop entries from any previous run, as the network parser does
        # in its own _generate_table_bbox.
        self.table_bbox_parses = {}

        # Collect bboxes from both parsers
        self.lattice_parser._generate_table_bbox()
        _lattice_bboxes = sorted(
            self.lattice_parser.table_bbox_parses,
            key=lambda bbox: (bbox[0], -bbox[1]))
        self.network_parser._generate_table_bbox()
        _network_bboxes = sorted(
            self.network_parser.table_bbox_parses,
            key=lambda bbox: (bbox[0], -bbox[1]))

        # Merge the data from both processes
        for lattice_bbox in _lattice_bboxes:
            merged = False
            # Iterate backwards so merged entries can be deleted safely.
            for idx in range(len(_network_bboxes) - 1, -1, -1):
                network_bbox = _network_bboxes[idx]
                if not bboxes_overlap(lattice_bbox, network_bbox):
                    continue
                self._merge_bbox_analysis(lattice_bbox, network_bbox)
                del _network_bboxes[idx]
                merged = True
            if not merged:
                self.table_bbox_parses[lattice_bbox] = self.lattice_parser

        # Add the bboxes from network that haven't been merged
        for network_bbox in _network_bboxes:
            self.table_bbox_parses[network_bbox] = self.network_parser

    def record_parse_metadata(self, table):
        """Record data about the origin of the table (base behaviour)."""
        super().record_parse_metadata(table)

View File

@ -2,8 +2,6 @@
from __future__ import division from __future__ import division
import os import os
import copy
from .base import BaseParser from .base import BaseParser
from ..utils import ( from ..utils import (
@ -173,7 +171,6 @@ class Lattice(BaseParser):
super().record_parse_metadata(table) super().record_parse_metadata(table)
# for plotting # for plotting
table._image = self.pdf_image # Reuse the image used for calc table._image = self.pdf_image # Reuse the image used for calc
table._bbox_unscaled = self.table_bbox_unscaled
table._segments = (self.vertical_segments, self.horizontal_segments) table._segments = (self.vertical_segments, self.horizontal_segments)
def _generate_table_bbox(self): def _generate_table_bbox(self):
@ -193,7 +190,7 @@ class Lattice(BaseParser):
os.path.basename(self.filename), os.path.basename(self.filename),
".png" ".png"
) )
export_pdf_as_png(self.filename, self.image_path) export_pdf_as_png(self.filename, self.image_path, self.resolution)
self.pdf_image, self.threshold = adaptive_threshold( self.pdf_image, self.threshold = adaptive_threshold(
self.image_path, self.image_path,
process_background=self.process_background, process_background=self.process_background,
@ -250,17 +247,59 @@ class Lattice(BaseParser):
areas = scale_areas(self.table_areas) areas = scale_areas(self.table_areas)
table_bbox = find_joints(areas, vertical_mask, horizontal_mask) table_bbox = find_joints(areas, vertical_mask, horizontal_mask)
self.table_bbox_unscaled = copy.deepcopy(table_bbox)
[ [
self.table_bbox, self.table_bbox_parses,
self.vertical_segments, self.vertical_segments,
self.horizontal_segments self.horizontal_segments
] = scale_image( ] = scale_image(
table_bbox, vertical_segments, horizontal_segments, pdf_scalers table_bbox, vertical_segments, horizontal_segments, pdf_scalers
) )
def _generate_columns_and_rows(self, bbox, table_idx): for bbox, parse in self.table_bbox_parses.items():
joints = parse["joints"]
# Merge x coordinates that are close together
line_tol = self.line_tol
# Sort the joints, make them a list of lists (instead of sets)
joints_normalized = list(
map(
lambda x: list(x),
sorted(joints, key=lambda j: - j[0])
)
)
for idx in range(1, len(joints_normalized)):
x_left, x_right = \
joints_normalized[idx-1][0], joints_normalized[idx][0]
if x_left - line_tol <= x_right <= x_left + line_tol:
joints_normalized[idx][0] = x_left
# Merge y coordinates that are close together
joints_normalized = sorted(joints_normalized, key=lambda j: -j[1])
for idx in range(1, len(joints_normalized)):
y_bottom, y_top = \
joints_normalized[idx-1][1], joints_normalized[idx][1]
if y_bottom - line_tol <= y_top <= y_bottom + line_tol:
joints_normalized[idx][1] = y_bottom
# FRHTODO: check this is useful, otherwise get rid of the code
# above
parse["joints_normalized"] = joints_normalized
cols = list(map(lambda coords: coords[0], joints))
cols.extend([bbox[0], bbox[2]])
rows = list(map(lambda coords: coords[1], joints))
rows.extend([bbox[1], bbox[3]])
# sort horizontal and vertical segments
cols = merge_close_lines(sorted(cols), line_tol=self.line_tol)
rows = merge_close_lines(
sorted(rows, reverse=True),
line_tol=self.line_tol
)
parse["col_anchors"] = cols
parse["row_anchors"] = rows
def _generate_columns_and_rows(self, bbox, user_cols):
# select elements which lie within table_bbox # select elements which lie within table_bbox
v_s, h_s = segments_in_bbox( v_s, h_s = segments_in_bbox(
bbox, self.vertical_segments, self.horizontal_segments bbox, self.vertical_segments, self.horizontal_segments
@ -270,21 +309,17 @@ class Lattice(BaseParser):
self.horizontal_text, self.horizontal_text,
self.vertical_text self.vertical_text
) )
parse = self.table_bbox_parses[bbox]
cols, rows = zip(*self.table_bbox[bbox])
cols, rows = list(cols), list(rows)
cols.extend([bbox[0], bbox[2]])
rows.extend([bbox[1], bbox[3]])
# sort horizontal and vertical segments
cols = merge_close_lines(sorted(cols), line_tol=self.line_tol)
rows = merge_close_lines(
sorted(rows, reverse=True),
line_tol=self.line_tol
)
# make grid using x and y coord of shortlisted rows and cols # make grid using x and y coord of shortlisted rows and cols
cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)] cols = [
rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)] (parse["col_anchors"][i], parse["col_anchors"][i + 1])
for i in range(0, len(parse["col_anchors"]) - 1)
]
rows = [
(parse["row_anchors"][i], parse["row_anchors"][i + 1])
for i in range(0, len(parse["row_anchors"]) - 1)
]
return cols, rows, v_s, h_s return cols, rows, v_s, h_s
def _generate_table(self, table_idx, cols, rows, **kwargs): def _generate_table(self, table_idx, cols, rows, **kwargs):

View File

@ -19,7 +19,8 @@ from ..utils import (
text_in_bbox, text_in_bbox,
textlines_overlapping_bbox, textlines_overlapping_bbox,
bbox_from_textlines, bbox_from_textlines,
find_columns_coordinates, find_columns_boundaries,
boundaries_to_split_lines,
text_in_bbox_per_axis, text_in_bbox_per_axis,
) )
@ -438,7 +439,7 @@ class TextNetworks(TextAlignments):
tls_search_space.remove(most_aligned_tl) tls_search_space.remove(most_aligned_tl)
tls_in_bbox = [most_aligned_tl] tls_in_bbox = [most_aligned_tl]
last_bbox = None last_bbox = None
last_cols_cand = [most_aligned_tl.x0, most_aligned_tl.x1] last_cols_bounds = [(most_aligned_tl.x0, most_aligned_tl.x1)]
while last_bbox != bbox: while last_bbox != bbox:
if parse_details_search is not None: if parse_details_search is not None:
# Store debug info # Store debug info
@ -479,9 +480,9 @@ class TextNetworks(TextAlignments):
# of the new row won't reduce the number of columns. # of the new row won't reduce the number of columns.
# This happens when text covers multiple rows - that's only # This happens when text covers multiple rows - that's only
# allowed in the header, treated separately. # allowed in the header, treated separately.
cols_cand = find_columns_coordinates(tls_in_new_box) cols_bounds = find_columns_boundaries(tls_in_new_box)
if direction in ["bottom", "top"] and \ if direction in ["bottom", "top"] and \
len(cols_cand) < len(last_cols_cand): len(cols_bounds) < len(last_cols_bounds):
continue continue
# We have an expansion candidate: register it, update the # We have an expansion candidate: register it, update the
@ -489,7 +490,7 @@ class TextNetworks(TextAlignments):
# We use bbox_from_textlines instead of cand_bbox in case some # We use bbox_from_textlines instead of cand_bbox in case some
# overlapping textlines require a large bbox for strict fit. # overlapping textlines require a large bbox for strict fit.
bbox = cand_bbox = list(bbox_from_textlines(tls_in_new_box)) bbox = cand_bbox = list(bbox_from_textlines(tls_in_new_box))
last_cols_cand = cols_cand last_cols_bounds = cols_bounds
tls_in_bbox.extend(new_tls) tls_in_bbox.extend(new_tls)
for i in range(len(tls_search_space) - 1, -1, -1): for i in range(len(tls_search_space) - 1, -1, -1):
textline = tls_search_space[i] textline = tls_search_space[i]
@ -591,7 +592,7 @@ class Network(TextBaseParser):
textlines = self._apply_regions_filter(all_textlines) textlines = self._apply_regions_filter(all_textlines)
textlines_processed = {} textlines_processed = {}
self.table_bbox = {} self.table_bbox_parses = {}
if self.parse_details is not None: if self.parse_details is not None:
parse_details_network_searches = [] parse_details_network_searches = []
self.parse_details["network_searches"] = \ self.parse_details["network_searches"] = \
@ -641,7 +642,8 @@ class Network(TextBaseParser):
# Get all the textlines that overlap with the box, compute # Get all the textlines that overlap with the box, compute
# columns # columns
tls_in_bbox = textlines_overlapping_bbox(bbox_body, textlines) tls_in_bbox = textlines_overlapping_bbox(bbox_body, textlines)
cols_anchors = find_columns_coordinates(tls_in_bbox) cols_boundaries = find_columns_boundaries(tls_in_bbox)
cols_anchors = boundaries_to_split_lines(cols_boundaries)
# Unless the user gave us strict bbox_body, try to find a header # Unless the user gave us strict bbox_body, try to find a header
# above the body to build the full bbox. # above the body to build the full bbox.
@ -662,10 +664,11 @@ class Network(TextBaseParser):
table_parse = { table_parse = {
"bbox_body": bbox_body, "bbox_body": bbox_body,
"cols_boundaries": cols_boundaries,
"cols_anchors": cols_anchors, "cols_anchors": cols_anchors,
"bbox_full": bbox_full "bbox_full": bbox_full
} }
self.table_bbox[bbox_full] = table_parse self.table_bbox_parses[bbox_full] = table_parse
if self.parse_details is not None: if self.parse_details is not None:
self.parse_details["col_searches"].append(table_parse) self.parse_details["col_searches"].append(table_parse)
@ -678,7 +681,7 @@ class Network(TextBaseParser):
textlines textlines
)) ))
def _generate_columns_and_rows(self, bbox, table_idx): def _generate_columns_and_rows(self, bbox, user_cols):
# select elements which lie within table_bbox # select elements which lie within table_bbox
self.t_bbox = text_in_bbox_per_axis( self.t_bbox = text_in_bbox_per_axis(
bbox, bbox,
@ -706,18 +709,14 @@ class Network(TextBaseParser):
rows_grouped = self._group_rows(all_tls, row_tol=self.row_tol) rows_grouped = self._group_rows(all_tls, row_tol=self.row_tol)
rows = self._join_rows(rows_grouped, text_y_max, text_y_min) rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
if self.columns is not None and self.columns[table_idx] != "": if user_cols is not None:
# user has to input boundary columns too cols = [text_x_min] + user_cols + [text_x_max]
# take (0, pdf_width) by default cols = [
# similar to else condition (cols[i], cols[i + 1])
# len can't be 1 for i in range(0, len(cols) - 1)
cols = self.columns[table_idx].split(",") ]
cols = [float(c) for c in cols]
cols.insert(0, text_x_min)
cols.append(text_x_max)
cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
else: else:
parse_details = self.table_bbox[bbox] parse_details = self.table_bbox_parses[bbox]
col_anchors = parse_details["cols_anchors"] col_anchors = parse_details["cols_anchors"]
cols = list(map( cols = list(map(
lambda idx: [col_anchors[idx], col_anchors[idx + 1]], lambda idx: [col_anchors[idx], col_anchors[idx + 1]],

View File

@ -122,14 +122,14 @@ class Stream(TextBaseParser):
self.horizontal_text) self.horizontal_text)
hor_text.extend(region_text) hor_text.extend(region_text)
# find tables based on nurminen's detection algorithm # find tables based on nurminen's detection algorithm
table_bbox = self._nurminen_table_detection(hor_text) table_bbox_parses = self._nurminen_table_detection(hor_text)
else: else:
table_bbox = {} table_bbox_parses = {}
for area_str in self.table_areas: for area_str in self.table_areas:
table_bbox[bbox_from_str(area_str)] = None table_bbox_parses[bbox_from_str(area_str)] = None
self.table_bbox = table_bbox self.table_bbox_parses = table_bbox_parses
def _generate_columns_and_rows(self, bbox, table_idx): def _generate_columns_and_rows(self, bbox, user_cols):
# select elements which lie within table_bbox # select elements which lie within table_bbox
self.t_bbox = text_in_bbox_per_axis( self.t_bbox = text_in_bbox_per_axis(
bbox, bbox,
@ -140,26 +140,18 @@ class Stream(TextBaseParser):
text_x_min, text_y_min, text_x_max, text_y_max = bbox_from_textlines( text_x_min, text_y_min, text_x_max, text_y_max = bbox_from_textlines(
self.t_bbox["horizontal"] + self.t_bbox["vertical"] self.t_bbox["horizontal"] + self.t_bbox["vertical"]
) )
# FRHTODO:
# This algorithm takes the horizontal textlines in the bbox, and groups
# them into rows based on their bottom y0.
# That's wrong: it misses the vertical items, and misses out on all
# the alignment identification work we've done earlier.
rows_grouped = self._group_rows( rows_grouped = self._group_rows(
self.t_bbox["horizontal"], row_tol=self.row_tol) self.t_bbox["horizontal"], row_tol=self.row_tol)
rows = self._join_rows(rows_grouped, text_y_max, text_y_min) rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
elements = [len(r) for r in rows_grouped] elements = [len(r) for r in rows_grouped]
if self.columns is not None and self.columns[table_idx] != "": if user_cols is not None:
# user has to input boundary columns too cols = [text_x_min] + user_cols + [text_x_max]
# take (0, pdf_width) by default cols = [
# similar to else condition (cols[i], cols[i + 1])
# len can't be 1 for i in range(0, len(cols) - 1)
cols = self.columns[table_idx].split(",") ]
cols = [float(c) for c in cols]
cols.insert(0, text_x_min)
cols.append(text_x_max)
cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
else: else:
# calculate mode of the list of number of elements in # calculate mode of the list of number of elements in
# each row to guess the number of columns # each row to guess the number of columns
@ -175,8 +167,8 @@ class Stream(TextBaseParser):
ncols = max(set(elements), key=elements.count) ncols = max(set(elements), key=elements.count)
else: else:
warnings.warn( warnings.warn(
"No tables found in table area {}" "No tables found in table area {bbox}".format(
.format(table_idx + 1) bbox=bbox)
) )
cols = [ cols = [
(t.x0, t.x1) (t.x0, t.x1)

View File

@ -74,7 +74,7 @@ def draw_labeled_bbox(
) )
def draw_pdf(table, ax, to_pdf_scale=True): def draw_pdf(table, ax):
"""Draw the content of the table's source pdf into the passed subplot """Draw the content of the table's source pdf into the passed subplot
Parameters Parameters
@ -83,14 +83,9 @@ def draw_pdf(table, ax, to_pdf_scale=True):
ax : matplotlib.axes.Axes (optional) ax : matplotlib.axes.Axes (optional)
to_pdf_scale : bool (optional)
""" """
img = table.get_pdf_image() img = table.get_pdf_image()
if to_pdf_scale:
ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1])) ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
else:
ax.imshow(img)
def draw_parse_constraints(table, ax): def draw_parse_constraints(table, ax):
@ -132,8 +127,6 @@ def draw_text(table, ax):
table : camelot.core.Table table : camelot.core.Table
ax : matplotlib.axes.Axes (optional) ax : matplotlib.axes.Axes (optional)
ax : matplotlib.axes.Axes
""" """
bbox = bbox_from_textlines(table.textlines) bbox = bbox_from_textlines(table.textlines)
for t in table.textlines: for t in table.textlines:
@ -150,18 +143,14 @@ def draw_text(table, ax):
extend_axe_lim(ax, bbox) extend_axe_lim(ax, bbox)
def prepare_plot(table, ax=None, to_pdf_scale=True): def prepare_plot(table, ax=None):
"""Initialize plot and draw common components """Initialize plot and draw common components
Parameters Parameters
---------- ----------
table : camelot.core.Table table : camelot.core.Table
ax : matplotlib.axes.Axes (optional) ax : matplotlib.axes.Axes (optional)
to_pdf_scale :
ax : matplotlib.axes.Axes
to_pdf_scale : bool (optional)
Returns Returns
------- -------
@ -170,7 +159,7 @@ def prepare_plot(table, ax=None, to_pdf_scale=True):
if ax is None: if ax is None:
fig = plt.figure() fig = plt.figure()
ax = fig.add_subplot(111, aspect="equal") ax = fig.add_subplot(111, aspect="equal")
draw_pdf(table, ax, to_pdf_scale) draw_pdf(table, ax)
draw_parse_constraints(table, ax) draw_parse_constraints(table, ax)
return ax return ax
@ -186,7 +175,8 @@ class PlotMethods():
table: camelot.core.Table table: camelot.core.Table
A Camelot Table. A Camelot Table.
kind : str, optional (default: 'text') kind : str, optional (default: 'text')
{'text', 'grid', 'contour', 'joint', 'line'} {'text', 'grid', 'contour', 'joint', 'line',
'network_table_search'}
The element type for which a plot should be generated. The element type for which a plot should be generated.
filepath: str, optional (default: None) filepath: str, optional (default: None)
Absolute path for saving the generated plot. Absolute path for saving the generated plot.
@ -203,9 +193,12 @@ class PlotMethods():
raise NotImplementedError( raise NotImplementedError(
"Lattice flavor does not support kind='{}'".format(kind) "Lattice flavor does not support kind='{}'".format(kind)
) )
if table.flavor in ["stream", "network"] and kind in ["line"]: if table.flavor != "lattice" and kind in ["line"]:
raise NotImplementedError( raise NotImplementedError(
"Stream flavor does not support kind='{}'".format(kind) "{flavor} flavor does not support kind='{kind}'".format(
flavor=table.flavor,
kind=kind
)
) )
plot_method = getattr(self, kind) plot_method = getattr(self, kind)
@ -274,25 +267,21 @@ class PlotMethods():
""" """
_FOR_LATTICE = table.flavor == "lattice" _FOR_LATTICE = table.flavor == "lattice"
ax = prepare_plot(table, ax, to_pdf_scale=not _FOR_LATTICE) ax = prepare_plot(table, ax)
if _FOR_LATTICE:
table_bbox = table._bbox_unscaled
else:
table_bbox = {table._bbox: None}
if not _FOR_LATTICE: if not _FOR_LATTICE:
draw_text(table, ax) draw_text(table, ax)
for t in table_bbox.keys():
ax.add_patch( ax.add_patch(
patches.Rectangle( patches.Rectangle(
(t[0], t[1]), t[2] - t[0], t[3] - t[1], (table._bbox[0], table._bbox[1]),
table._bbox[2] - table._bbox[0],
table._bbox[3] - table._bbox[1],
fill=False, color="red" fill=False, color="red"
) )
) )
if not _FOR_LATTICE: if not _FOR_LATTICE:
extend_axe_lim(ax, t) extend_axe_lim(ax, table._bbox)
return ax.get_figure() return ax.get_figure()
@ -393,12 +382,10 @@ class PlotMethods():
fig : matplotlib.fig.Figure fig : matplotlib.fig.Figure
""" """
ax = prepare_plot(table, ax, to_pdf_scale=False) ax = prepare_plot(table, ax)
table_bbox = table._bbox_unscaled
x_coord = [] x_coord = []
y_coord = [] y_coord = []
for k in table_bbox.keys(): for coord in table.parse["joints"]:
for coord in table_bbox[k]:
x_coord.append(coord[0]) x_coord.append(coord[0])
y_coord.append(coord[1]) y_coord.append(coord[1])
ax.plot(x_coord, y_coord, "ro") ax.plot(x_coord, y_coord, "ro")

View File

@ -297,8 +297,9 @@ def scale_image(tables, v_segments, h_segments, factors):
j_x, j_y = zip(*tables[k]) j_x, j_y = zip(*tables[k])
j_x = [scale(j, scaling_factor_x) for j in j_x] j_x = [scale(j, scaling_factor_x) for j in j_x]
j_y = [scale(abs(translate(-img_y, j)), scaling_factor_y) for j in j_y] j_y = [scale(abs(translate(-img_y, j)), scaling_factor_y) for j in j_y]
joints = zip(j_x, j_y) tables_new[(x1, y1, x2, y2)] = {
tables_new[(x1, y1, x2, y2)] = joints "joints": list(zip(j_x, j_y))
}
v_segments_new = [] v_segments_new = []
for v in v_segments: for v in v_segments:
@ -434,6 +435,16 @@ def bbox_from_str(bbox_str):
) )
def bboxes_overlap(bbox1, bbox2):
    """Tell whether two bounding boxes share some area.

    The test is symmetric: it also reports an overlap when one box fully
    contains the other, when both boxes are identical, or when they cross
    each other without any corner of one lying inside the other.
    Boxes that merely touch along an edge are not considered overlapping.

    Parameters
    ----------
    bbox1 : tuple
        (left, bottom, right, top) coordinates of the first box.
    bbox2 : tuple
        (left, bottom, right, top) coordinates of the second box.

    Returns
    -------
    overlap : bool
        True if the boxes overlap on both the x and the y axis.

    """
    (left1, bottom1, right1, top1) = bbox1
    (left2, bottom2, right2, top2) = bbox2
    # Two intervals overlap when each one starts before the other ends;
    # the boxes overlap when that holds on both axes.
    return (
        left1 < right2 and left2 < right1
        and bottom1 < top2 and bottom2 < top1
    )
def textlines_overlapping_bbox(bbox, textlines): def textlines_overlapping_bbox(bbox, textlines):
"""Returns all text objects which overlap or are within a bounding box. """Returns all text objects which overlap or are within a bounding box.
@ -451,12 +462,10 @@ def textlines_overlapping_bbox(bbox, textlines):
List of PDFMiner text objects. List of PDFMiner text objects.
""" """
(left, bottom, right, top) = bbox
t_bbox = [ t_bbox = [
t t
for t in textlines for t in textlines
if ((left < t.x0 < right) or (left < t.x1 < right)) if bboxes_overlap(bbox, (t.x0, t.y0, t.x1, t.y1))
and ((bottom < t.y0 < top) or (bottom < t.y1 < top))
] ]
return t_bbox return t_bbox
@ -560,27 +569,25 @@ def bbox_from_textlines(textlines):
return bbox return bbox
def find_columns_coordinates(tls, min_gap=1.0): def find_columns_boundaries(tls, min_gap=1.0):
"""Given a list of text objects, guess columns boundaries and returns a """Make a list of disjunct cols boundaries for a list of text objects
list of x-coordinates for split points between columns.
Parameters Parameters
---------- ----------
tls : list of PDFMiner text object. tls : list of PDFMiner text object.
min_gap : minimum distance between columns. Any elements closer than this min_gap : minimum distance between columns. Any elements closer than
threshold are merged together. This is to prevent spaces between words this threshold are merged together. This is to prevent spaces between
to be misinterpreted as column boundaries. words to be misinterpreted as boundaries.
Returns Returns
------- -------
cols_anchors : list boundaries : list
List of x-coordinates for columns. List x-coordinates for cols.
[(1st col left, 1st col right), (2nd col left, 2nd col right), ...]
""" """
# Make a list of disjunct cols boundaries across the textlines
# that comprise the table.
# [(1st col left, 1st col right), (2nd col left, 2nd col right), ...]
cols_bounds = [] cols_bounds = []
tls.sort(key=lambda tl: tl.x0) tls.sort(key=lambda tl: tl.x0)
for tl in tls: for tl in tls:
@ -588,18 +595,64 @@ def find_columns_coordinates(tls, min_gap=1.0):
cols_bounds.append([tl.x0, tl.x1]) cols_bounds.append([tl.x0, tl.x1])
else: else:
cols_bounds[-1][1] = max(cols_bounds[-1][1], tl.x1) cols_bounds[-1][1] = max(cols_bounds[-1][1], tl.x1)
return cols_bounds
def find_rows_boundaries(tls, min_gap=1.0):
    """Make a list of disjoint row boundaries for a list of text objects.

    Parameters
    ----------
    tls : list of PDFMiner text object.
        Only the ``y0`` and ``y1`` attributes are read. The list is not
        modified: a sorted copy is iterated instead.
    min_gap : minimum distance between rows. Any elements closer than
        this threshold are merged together.

    Returns
    -------
    boundaries : list
        List y-coordinates for rows, ordered bottom to top.
        [[1st row bottom, 1st row top], [2nd row bottom, 2nd row top], ...]

    """
    rows_bounds = []
    # Iterate over a sorted copy so the caller's list keeps its order.
    for tl in sorted(tls, key=lambda tl: tl.y0):
        if (not rows_bounds) or rows_bounds[-1][1] + min_gap < tl.y0:
            # Gap wider than min_gap above the current row: start a new row.
            rows_bounds.append([tl.y0, tl.y1])
        else:
            # Close enough to the current row: extend its top if needed.
            rows_bounds[-1][1] = max(rows_bounds[-1][1], tl.y1)
    return rows_bounds
def boundaries_to_split_lines(boundaries):
    """Find split lines given a list of boundaries between rows or cols.

    Boundaries:    [ a ]  [b]  [   c   ]  [d]
    Splits:        |    |    |          |   |

    Parameters
    ----------
    boundaries : list
        List of tuples of x- (for columns) or y- (for rows) coord boundaries.
        These are the (left, right most) or (bottom, top most) coordinates.

    Returns
    -------
    anchors : list
        List of coordinates representing the split points: the start of the
        first boundary, the mid point between each pair of consecutive
        boundaries, and the end of the last boundary. Empty if no
        boundaries are given.

    """
    # Nothing to split: avoid indexing into an empty list.
    if not boundaries:
        return []
    anchors = [boundaries[0][0]]
    # Each inner split sits half way between the end of one boundary and
    # the start of the next one.
    anchors.extend(
        (previous[1] + current[0]) / 2.0
        for previous, current in zip(boundaries, boundaries[1:])
    )
    anchors.append(boundaries[-1][1])
    return anchors
def get_index_closest_point(point, sorted_list, fn=lambda x: x): def get_index_closest_point(point, sorted_list, fn=lambda x: x):
@ -1129,17 +1182,20 @@ def get_text_objects(layout, ltype="char", t=None):
return t return t
def export_pdf_as_png(pdf_path, destination_path): def export_pdf_as_png(pdf_path, destination_path, resolution=300):
"""Generate an image from a pdf. """Generate an image from a pdf.
Parameters Parameters
---------- ----------
pdf_path : str pdf_path : str
destination_path : str destination_path : str
resolution : int
""" """
gs_call = "-q -sDEVICE=png16m -o {destination_path} -r300 {pdf_path}"\ gs_call = "-q -sDEVICE=png16m -o " \
"{destination_path} -r{resolution} {pdf_path}" \
.format( .format(
destination_path=destination_path, destination_path=destination_path,
resolution=resolution,
pdf_path=pdf_path pdf_path=pdf_path
) )
gs_call = gs_call.encode().split() gs_call = gs_call.encode().split()

File diff suppressed because one or more lines are too long

View File

@ -2074,6 +2074,502 @@ data_network_vertical_headers = [
], ],
] ]
# Compared to network, hybrid detects additional sparse columns
data_hybrid_vertical_headers = [
[
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"Congress-",
"",
"",
"Senator 36th",
"",
"Rep106th",
"",
"Reg. of",
"",
"Road",
"",
"",
"Distri",
"Dist",
"",
"",
"Dist",
],
[
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"1st Dist",
"Dist.",
"",
"",
"Dist.",
"Deeds",
"",
"Commission",
"",
"District #1",
"",
"ct #2",
"#3",
"Dist #4",
"",
"#5",
],
[
"",
"",
"",
"",
"",
"Governor",
"",
"",
"U.S. Senator",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
],
[
"",
"Number of Registered voters",
"Poll Book Totals",
"Brian Calley",
"Patrick Colbeck",
"Jim Hines",
"Bill Schuette",
"John James",
"Sandy Pensler",
"",
"Jack Bergman",
"",
"Jim Stamas",
"",
"Sue Allor",
"",
"Melissa A. Cordes",
"",
"Al Scully",
"",
"Daniel G. Gauthier",
"Craig M. Clemens",
"Craig Johnston",
"Carolyn Brummund",
"Adam Brege",
"David Bielusiak",
"",
],
[
"Alcona",
"963",
"439",
"55",
"26",
"47",
"164",
"173",
"111",
"",
"268",
"",
"272",
"",
"275",
"",
"269",
"",
"271",
"",
"224",
"76",
"",
"",
"",
"",
"",
],
[
"Caledonia",
"923",
"393",
"40",
"23",
"45",
"158",
"150",
"103",
"",
"244",
"",
"247",
"",
"254",
"",
"255",
"",
"244",
"",
"139",
"143",
"",
"",
"",
"",
"",
],
[
"Curtis",
"1026",
"349",
"30",
"30",
"25",
"102",
"95",
"84",
"",
"159",
"",
"164",
"",
"162",
"",
"161",
"",
"157",
"",
"",
"",
"",
"",
"",
"",
"",
],
[
"Greenbush",
"1212",
"423",
"56",
"26",
"40",
"126",
"104",
"131",
"",
"208",
"",
"213",
"",
"214",
"",
"215",
"",
"208",
"",
"",
"",
"",
"208",
"",
"",
"",
],
[
"Gustin",
"611",
"180",
"22",
"35",
"17",
"55",
"73",
"45",
"",
"108",
"",
"104",
"",
"111",
"",
"111",
"",
"109",
"",
"",
"",
"",
"",
"81",
"42",
"",
],
[
"Harrisville",
"1142",
"430",
"45",
"90",
"29",
"101",
"155",
"94",
"",
"226",
"",
"226",
"",
"232",
"",
"244",
"",
"226",
"",
"",
"",
"232",
"",
"",
"",
"",
],
[
"Hawes",
"884",
"293",
"38",
"36",
"27",
"109",
"121",
"84",
"",
"192",
"",
"195",
"",
"195",
"",
"193",
"",
"184",
"",
"",
"",
"",
"",
"118",
"87",
"",
],
[
"Haynes",
"626",
"275",
"31",
"20",
"32",
"104",
"121",
"53",
"",
"163",
"",
"163",
"",
"173",
"",
"161",
"",
"152",
"",
"",
"",
"76",
"",
"69",
"31",
"",
],
[
"Mikado",
"781",
"208",
"19",
"39",
"17",
"81",
"90",
"63",
"",
"149",
"",
"149",
"",
"145",
"",
"147",
"",
"143",
"",
"",
"",
"",
"113",
"",
"",
"",
],
[
"Millen",
"353",
"139",
"7",
"16",
"13",
"38",
"49",
"19",
"",
"62",
"",
"66",
"",
"67",
"",
"66",
"",
"62",
"",
"",
"",
"",
"",
"",
"",
"",
],
[
"Mitchell",
"327",
"96",
"12",
"17",
"7",
"29",
"41",
"17",
"",
"57",
"",
"55",
"",
"57",
"",
"60",
"",
"56",
"",
"",
"",
"",
"",
"",
"",
"",
],
[
"City Harrisville",
"389",
"171",
"16",
"15",
"18",
"35",
"49",
"31",
"",
"78",
"",
"80",
"",
"82",
"",
"81",
"",
"77",
"",
"",
"",
"73",
"",
"",
"",
"",
],
[
"Totals",
"9237",
"3396",
"371",
"373",
"317",
"1102",
"1221",
"835",
"0",
"1914",
"0",
"1934",
"",
"1967",
"",
"1963",
"0",
"1889",
"0",
"363",
"219",
"381",
"321",
"268",
"160",
"0",
],
]
data_stream_table_areas = [ data_stream_table_areas = [

Binary file not shown.

Before

Width:  |  Height:  |  Size: 33 KiB

After

Width:  |  Height:  |  Size: 46 KiB

View File

@ -285,6 +285,19 @@ def test_network_layout_kwargs():
assert_frame_equal(df, tables[0].df) assert_frame_equal(df, tables[0].df)
# Hybrid parser
def test_hybrid_vertical_header():
"""Tests a complex table with a vertical text header.

Parses vertical_header.pdf with the hybrid flavor and expects exactly one
table whose content equals data_hybrid_vertical_headers.
"""
df = pd.DataFrame(data_hybrid_vertical_headers)
filename = os.path.join(testdir, "vertical_header.pdf")
tables = camelot.read_pdf(filename, flavor="hybrid")
assert len(tables) == 1
assert_frame_equal(df, tables[0].df)
# Lattice parser tests
def test_lattice(): def test_lattice():
df = pd.DataFrame(data_lattice) df = pd.DataFrame(data_lattice)