WIP: Introduce actual hybrid parser
Create a hybrid parser leveraging both lattice and network techniques. Simplify plotting of the PDF in the lattice case. Rename "parser.table_bbox" to "parser.table_bbox_parses", since it holds not a bbox but a dict mapping each bbox to its corresponding parsing data. Still missing: more unit tests, plotting of steps.
parent 6711f877bf
commit 77d289bd86
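The new flavor plugs into the existing read_pdf entry point alongside 'lattice', 'stream', and 'network'. A minimal usage sketch, mirroring the test added at the bottom of this diff (the file name is illustrative):

    import camelot

    # "hybrid" runs both the lattice and the network analyses on each page
    # and merges the detected table regions (see Hybrid._generate_table_bbox).
    tables = camelot.read_pdf("vertical_header.pdf", flavor="hybrid")
    print("Found {} tables".format(tables.n))
    print(tables[0].df)  # the parsed cells as a pandas DataFrame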
@@ -396,7 +396,8 @@ def network(c, *args, **kwargs):
             "Please specify output file format using --format")
     tables = read_pdf(
-        filepath, pages=pages, flavor="network", suppress_stdout=quiet, **kwargs
+        filepath, pages=pages, flavor="network",
+        suppress_stdout=quiet, **kwargs
     )
     click.echo("Found {} tables".format(tables.n))
     if plot_type is not None:
@@ -454,7 +454,9 @@ class Table():
        self.page = None
        self.flavor = None  # Flavor of the parser used
        self.pdf_size = None  # Dimensions of the original PDF page
-       self.parse_details = None  # Field holding debug data
+       self._bbox = None  # Bounding box in original document
+       self.parse = None  # Parse information
+       self.parse_details = None  # Field holding extra debug data
        self._image = None
        self._image_path = None  # Temporary file to hold an image of the pdf
@@ -7,7 +7,7 @@ import logging
 from PyPDF2 import PdfFileReader, PdfFileWriter

 from .core import TableList
-from .parsers import Stream, Lattice, Network
+from .parsers import Stream, Lattice, Network, Hybrid
 from .utils import (
     build_file_path_in_temp_dir,
     get_page_layout,
@@ -23,6 +23,7 @@ PARSERS = {
     "lattice": Lattice,
     "stream": Stream,
     "network": Network,
+    "hybrid": Hybrid,
 }
@@ -177,7 +178,8 @@ class PDFHandler():
         Parameters
         ----------
         flavor : str (default: 'lattice')
-            The parsing method to use ('lattice', 'stream', or 'network').
+            The parsing method to use ('lattice', 'stream', 'network',
+            or 'hybrid').
             Lattice is used by default.
         suppress_stdout : str (default: False)
             Suppress logs and warnings.
@@ -6,7 +6,9 @@ import cv2
 import numpy as np


-def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
+def adaptive_threshold(
+        imagename, process_background=False,
+        blocksize=15, c=-2):
     """Thresholds an image using OpenCV's adaptiveThreshold.

     Parameters
@@ -19,12 +21,12 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
         Size of a pixel neighborhood that is used to calculate a
         threshold value for the pixel: 3, 5, 7, and so on.

-        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
+        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.  # noqa
     c : int, optional (default: -2)
         Constant subtracted from the mean or weighted mean.
         Normally, it is positive but may be zero or negative as well.

-        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
+        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.  # noqa

     Returns
     -------
@@ -39,7 +41,10 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):

     if process_background:
         threshold = cv2.adaptiveThreshold(
-            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, blocksize, c
+            gray,
+            255,
+            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+            cv2.THRESH_BINARY, blocksize, c
         )
     else:
         threshold = cv2.adaptiveThreshold(
@@ -54,7 +59,8 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):


 def find_lines(
-    threshold, regions=None, direction="horizontal", line_scale=15, iterations=0
+    threshold, regions=None,
+    direction="horizontal", line_scale=15, iterations=0
 ):
     """Finds horizontal and vertical lines by applying morphological
     transformations on an image.
@@ -78,7 +84,7 @@ def find_lines(
     iterations : int, optional (default: 0)
         Number of times for erosion/dilation is applied.

-        For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
+        For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.  # noqa

     Returns
     -------
@@ -100,7 +106,9 @@ def find_lines(
         size = threshold.shape[1] // line_scale
         el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1))
     elif direction is None:
-        raise ValueError("Specify direction as either 'vertical' or 'horizontal'")
+        raise ValueError(
+            "Specify direction as either 'vertical' or 'horizontal'"
+        )

     if regions is not None:
         region_mask = np.zeros(threshold.shape)
@@ -115,12 +123,14 @@ def find_lines(

     try:
         _, contours, _ = cv2.findContours(
-            threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
+            threshold.astype(np.uint8), cv2.RETR_EXTERNAL,
+            cv2.CHAIN_APPROX_SIMPLE
         )
     except ValueError:
         # for opencv backward compatibility
         contours, _ = cv2.findContours(
-            threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
+            threshold.astype(np.uint8), cv2.RETR_EXTERNAL,
+            cv2.CHAIN_APPROX_SIMPLE
         )

     for c in contours:
@@ -99,7 +99,7 @@ def read_pdf(

     """
     layout_kwargs = layout_kwargs or {}
-    if flavor not in ["lattice", "stream", "network"]:
+    if flavor not in ["lattice", "stream", "network", "hybrid"]:
         raise NotImplementedError(
             "Unknown flavor specified."
             " Use either 'lattice', 'stream', or 'network'"
@@ -3,3 +3,4 @@
 from .stream import Stream
 from .lattice import Lattice
 from .network import Network
+from .hybrid import Hybrid
@@ -34,8 +34,9 @@ class BaseParser():
         self.id = parser_id
         self.table_regions = table_regions
         self.table_areas = table_areas
-        self.table_bbox = {}
+        self.table_bbox_parses = {}
+        self.columns = None
         self.copy_text = copy_text
         self.split_text = split_text
         self.strip_text = strip_text
@@ -47,10 +48,18 @@ class BaseParser():
         self.t_bbox = None

         # For plotting details of parsing algorithms
+        self.resolution = 300  # default plotting resolution of the PDF.
         self.parse_details = {}
         if not debug:
             self.parse_details = None

+    def table_bboxes(self):
+        return sorted(
+            self.table_bbox_parses.keys(),
+            key=lambda x: x[1],
+            reverse=True
+        )
+
     def prepare_page_parse(self, filename, layout, dimensions,
                            page_idx, layout_kwargs):
         self.filename = filename
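The new table_bboxes() helper replaces the inline sorted(...) that _generate_tables used before (see the hunks below): boxes are ordered by the bbox element at index 1 (a y-coordinate) in descending order, so in PDF space, where y grows upward, tables come back top-of-page first. A small illustration with made-up boxes:

    # bbox tuples are (x1, y1, x2, y2) in PDF coordinates
    parses = {(50, 600, 500, 700): None, (50, 100, 500, 200): None}
    sorted(parses.keys(), key=lambda x: x[1], reverse=True)
    # -> [(50, 600, 500, 700), (50, 100, 500, 200)]  # topmost table first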
@@ -142,6 +151,7 @@ class BaseParser():
         table = Table(cols, rows)
         table.page = self.page
         table.order = table_idx + 1
+        table._bbox = self.table_bboxes()[table_idx]
         return table

     @staticmethod
@@ -177,7 +187,7 @@ class BaseParser():
                     table.cells[r_idx][c_idx].text = text
         return pos_errors

-    def _generate_columns_and_rows(self, bbox, table_idx):
+    def _generate_columns_and_rows(self, bbox, user_cols):
         # Pure virtual, must be defined by the derived parser
         raise NotImplementedError()
@@ -199,20 +209,23 @@ class BaseParser():

         _tables = []
         # sort tables based on y-coord
-        for table_idx, bbox in enumerate(
-            sorted(
-                self.table_bbox.keys(),
-                key=lambda x: x[1],
-                reverse=True
-            )
-        ):
+        for table_idx, bbox in enumerate(self.table_bboxes()):
+            if self.columns is not None and self.columns[table_idx] != "":
+                # user has to input boundary columns too
+                # take (0, pdf_width) by default
+                # similar to else condition
+                # len can't be 1
+                user_cols = self.columns[table_idx].split(",")
+                user_cols = [float(c) for c in user_cols]
+            else:
+                user_cols = None

             cols, rows, v_s, h_s = self._generate_columns_and_rows(
                 bbox,
-                table_idx
+                user_cols
             )
             table = self._generate_table(
                 table_idx, cols, rows, v_s=v_s, h_s=h_s)
-            table._bbox = bbox
             _tables.append(table)

         return _tables
@@ -222,6 +235,7 @@ class BaseParser():
         """
         table.flavor = self.id
         table.filename = self.filename
+        table.parse = self.table_bbox_parses[table._bbox]
         table.parse_details = self.parse_details
         pos_errors = self.compute_parse_errors(table)
         table.accuracy = compute_accuracy([[100, pos_errors]])
@@ -453,17 +467,16 @@ class TextBaseParser(BaseParser):
                 raise ValueError("Length of table_areas and columns"
                                  " should be equal")

-    def record_parse_metadata(self, table):
-        """Record data about the origin of the table
-        """
-        super().record_parse_metadata(table)
-        # for plotting
-        table._bbox = self.table_bbox
-        table._segments = None
-
     def _generate_table(self, table_idx, cols, rows, **kwargs):
         table = self._initialize_new_table(table_idx, cols, rows)
         table = table.set_all_edges()
         self.record_parse_metadata(table)

         return table
+
+    def record_parse_metadata(self, table):
+        """Record data about the origin of the table
+        """
+        super().record_parse_metadata(table)
+        # for plotting
+        table._segments = None
@@ -0,0 +1,221 @@
+# -*- coding: utf-8 -*-
+
+from ..utils import (
+    bboxes_overlap,
+    boundaries_to_split_lines,
+)
+
+from .base import BaseParser
+from .network import Network
+from .lattice import Lattice
+
+
+class Hybrid(BaseParser):
+    """Defines a hybrid parser, leveraging both network and lattice parsers.
+
+    Parameters
+    ----------
+    table_regions : list, optional (default: None)
+        List of page regions that may contain tables of the form x1,y1,x2,y2
+        where (x1, y1) -> left-top and (x2, y2) -> right-bottom
+        in PDF coordinate space.
+    table_areas : list, optional (default: None)
+        List of table area strings of the form x1,y1,x2,y2
+        where (x1, y1) -> left-top and (x2, y2) -> right-bottom
+        in PDF coordinate space.
+    columns : list, optional (default: None)
+        List of column x-coordinates strings where the coordinates
+        are comma-separated.
+    split_text : bool, optional (default: False)
+        Split text that spans across multiple cells.
+    flag_size : bool, optional (default: False)
+        Flag text based on font size. Useful to detect
+        super/subscripts. Adds <s></s> around flagged text.
+    strip_text : str, optional (default: '')
+        Characters that should be stripped from a string before
+        assigning it to a cell.
+    edge_tol : int, optional (default: 50)
+        Tolerance parameter for extending textedges vertically.
+    row_tol : int, optional (default: 2)
+        Tolerance parameter used to combine text vertically,
+        to generate rows.
+    column_tol : int, optional (default: 0)
+        Tolerance parameter used to combine text horizontally,
+        to generate columns.
+
+    """
+
+    def __init__(
+            self,
+            table_regions=None,
+            table_areas=None,
+            columns=None,
+            flag_size=False,
+            split_text=False,
+            strip_text="",
+            edge_tol=None,
+            row_tol=2,
+            column_tol=0,
+            debug=False,
+            **kwargs):
+        super().__init__(
+            "hybrid",
+            table_regions=table_regions,
+            table_areas=table_areas,
+            flag_size=flag_size,
+            split_text=split_text,
+            strip_text=strip_text,
+            debug=debug,
+        )
+        self.network_parser = Network(
+            table_regions=table_regions,
+            table_areas=table_areas,
+            columns=columns,
+            flag_size=flag_size,
+            split_text=split_text,
+            strip_text=strip_text,
+            edge_tol=edge_tol,
+            row_tol=row_tol,
+            column_tol=column_tol,
+            debug=debug,
+        )
+        self.lattice_parser = Lattice(
+            table_regions=table_regions,
+            table_areas=table_areas,
+            flag_size=flag_size,
+            split_text=split_text,
+            strip_text=strip_text,
+            edge_tol=edge_tol,
+            row_tol=row_tol,
+            column_tol=column_tol,
+            debug=debug,
+        )
+
+    def prepare_page_parse(self, filename, layout, dimensions,
+                           page_idx, layout_kwargs):
+        super().prepare_page_parse(filename, layout, dimensions,
+                                   page_idx, layout_kwargs)
+        self.network_parser.prepare_page_parse(
+            filename, layout, dimensions, page_idx, layout_kwargs)
+        self.lattice_parser.prepare_page_parse(
+            filename, layout, dimensions, page_idx, layout_kwargs)
+
+    def _generate_columns_and_rows(self, bbox, table_idx):
+        parser = self.table_bbox_parses[bbox]
+        return parser._generate_columns_and_rows(bbox, table_idx)
+
+    def _generate_table(self, table_idx, cols, rows, **kwargs):
+        bbox = self.table_bboxes()[table_idx]
+        parser = self.table_bbox_parses[bbox]
+        return parser._generate_table(table_idx, cols, rows, **kwargs)
+
+    @staticmethod
+    def _augment_boundaries_with_splits(boundaries, splits, tolerance=0):
+        """Augment existing boundaries using provided hard splits.
+
+        Boundaries:  |---|   |-|   |---------|
+        Splits:      |      |     |             |
+        Augmented:   |-------|-----|-------|--|
+        """
+        idx_boundaries = len(boundaries) - 1
+        idx_splits = len(splits) - 1
+        previous_boundary = None
+        while True:
+            if idx_splits < 0:
+                # No more splits to incorporate, we're done
+                break
+            split = splits[idx_splits]
+
+            if idx_boundaries < 0:
+                # Need to insert remaining splits
+                new_boundary = [split, boundaries[0][0]]
+                boundaries.insert(0, new_boundary)
+                idx_splits = idx_splits - 1
+            else:
+                boundary = \
+                    boundaries[idx_boundaries]
+                if boundary[1] < \
+                        split + tolerance:
+                    # The lattice column is further to the right of our
+                    # col boundary. We move our left boundary to match.
+                    boundary[1] = split
+                    # And if there was another segment after, we make its
+                    # right boundary match as well so that there's no gap
+                    if previous_boundary is not None:
+                        previous_boundary[0] = split
+                    idx_splits = idx_splits - 1
+                elif boundary[0] > \
+                        split - tolerance:
+                    # Our boundary is fully after the split, move on
+                    idx_boundaries = idx_boundaries - 1
+                    previous_boundary = boundary
+                else:
+                    # The split is inside our boundary: split it
+                    new_boundary = [split, boundary[1]]
+                    boundaries.insert(idx_boundaries + 1, new_boundary)
+                    boundary[1] = split
+                    previous_boundary = new_boundary
+                    idx_splits = idx_splits - 1
+        return boundaries
+
+    def _merge_bbox_analysis(self, lattice_bbox, network_bbox):
+        """Identify splits that were only detected by lattice or by network
+        """
+        lattice_parse = self.lattice_parser.table_bbox_parses[lattice_bbox]
+        lattice_cols, lattice_rows = \
+            lattice_parse["col_anchors"], lattice_parse["row_anchors"]
+
+        network_bbox_data = self.network_parser.table_bbox_parses[network_bbox]
+        network_cols_boundaries = network_bbox_data["cols_boundaries"]
+
+        # Favor hybrid, but complete or adjust its columns based on the
+        # splits identified by lattice.
+        if network_cols_boundaries is None:
+            self.table_bbox_parses[lattice_bbox] = self.lattice_parser
+        else:
+            network_cols_boundaries = self._augment_boundaries_with_splits(
+                network_cols_boundaries, lattice_cols)  # self.column_tol???
+            augmented_bbox = (
+                network_cols_boundaries[0][0], network_bbox[1],
+                network_cols_boundaries[-1][1], network_bbox[3],
+            )
+            network_bbox_data["cols_anchors"] = \
+                boundaries_to_split_lines(network_cols_boundaries)
+
+            del self.network_parser.table_bbox_parses[network_bbox]
+            self.network_parser.table_bbox_parses[augmented_bbox] = \
+                network_bbox_data
+            self.table_bbox_parses[augmented_bbox] = self.network_parser
+
+    def _generate_table_bbox(self):
+        # Collect bboxes from both parsers
+        self.lattice_parser._generate_table_bbox()
+        _lattice_bboxes = sorted(
+            self.lattice_parser.table_bbox_parses,
+            key=lambda bbox: (bbox[0], -bbox[1]))
+        self.network_parser._generate_table_bbox()
+        _network_bboxes = sorted(
+            self.network_parser.table_bbox_parses,
+            key=lambda bbox: (bbox[0], -bbox[1]))
+
+        # Merge the data from both processes
+        for lattice_bbox in _lattice_bboxes:
+            merged = False
+
+            for idx in range(len(_network_bboxes)-1, -1, -1):
+                network_bbox = _network_bboxes[idx]
+                if not bboxes_overlap(lattice_bbox, network_bbox):
+                    continue
+                self._merge_bbox_analysis(lattice_bbox, network_bbox)
+                # network_bbox_data["cols_boundaries"]
+                del _network_bboxes[idx]
+                merged = True
+            if not merged:
+                self.table_bbox_parses[lattice_bbox] = self.lattice_parser
+
+        # Add the bboxes from network that haven't been merged
+        for network_bbox in _network_bboxes:
+            self.table_bbox_parses[network_bbox] = self.network_parser
+
+    def record_parse_metadata(self, table):
+        super().record_parse_metadata(table)
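To make the boundary augmentation concrete, here is a worked example with made-up coordinates; the static method walks splits and boundaries right to left, mutating and returning the boundaries list:

    # Two network column boundaries with a gap, one lattice ruling at x=15:
    # the gap is closed at the ruling from both sides.
    Hybrid._augment_boundaries_with_splits([[0, 10], [20, 30]], [15])
    # -> [[0, 15], [15, 30]]

    # A ruling that falls inside a single boundary splits it in two.
    Hybrid._augment_boundaries_with_splits([[0, 30]], [10])
    # -> [[0, 10], [10, 30]]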
@@ -2,8 +2,6 @@

 from __future__ import division
 import os
-import copy

 from .base import BaseParser
 from ..utils import (
@@ -173,7 +171,6 @@ class Lattice(BaseParser):
         super().record_parse_metadata(table)
         # for plotting
         table._image = self.pdf_image  # Reuse the image used for calc
-        table._bbox_unscaled = self.table_bbox_unscaled
         table._segments = (self.vertical_segments, self.horizontal_segments)

     def _generate_table_bbox(self):
@@ -193,7 +190,7 @@ class Lattice(BaseParser):
             os.path.basename(self.filename),
             ".png"
         )
-        export_pdf_as_png(self.filename, self.image_path)
+        export_pdf_as_png(self.filename, self.image_path, self.resolution)
         self.pdf_image, self.threshold = adaptive_threshold(
             self.image_path,
             process_background=self.process_background,
@@ -250,17 +247,59 @@ class Lattice(BaseParser):
             areas = scale_areas(self.table_areas)
             table_bbox = find_joints(areas, vertical_mask, horizontal_mask)

-        self.table_bbox_unscaled = copy.deepcopy(table_bbox)
-
         [
-            self.table_bbox,
+            self.table_bbox_parses,
             self.vertical_segments,
             self.horizontal_segments
         ] = scale_image(
             table_bbox, vertical_segments, horizontal_segments, pdf_scalers
         )

-    def _generate_columns_and_rows(self, bbox, table_idx):
+        for bbox, parse in self.table_bbox_parses.items():
+            joints = parse["joints"]
+
+            # Merge x coordinates that are close together
+            line_tol = self.line_tol
+            # Sort the joints, make them a list of lists (instead of sets)
+            joints_normalized = list(
+                map(
+                    lambda x: list(x),
+                    sorted(joints, key=lambda j: - j[0])
+                )
+            )
+            for idx in range(1, len(joints_normalized)):
+                x_left, x_right = \
+                    joints_normalized[idx-1][0], joints_normalized[idx][0]
+                if x_left - line_tol <= x_right <= x_left + line_tol:
+                    joints_normalized[idx][0] = x_left
+
+            # Merge y coordinates that are close together
+            joints_normalized = sorted(joints_normalized, key=lambda j: -j[1])
+            for idx in range(1, len(joints_normalized)):
+                y_bottom, y_top = \
+                    joints_normalized[idx-1][1], joints_normalized[idx][1]
+                if y_bottom - line_tol <= y_top <= y_bottom + line_tol:
+                    joints_normalized[idx][1] = y_bottom
+
+            # FRHTODO: check this is useful, otherwise get rid of the code
+            # above
+            parse["joints_normalized"] = joints_normalized
+
+            cols = list(map(lambda coords: coords[0], joints))
+            cols.extend([bbox[0], bbox[2]])
+            rows = list(map(lambda coords: coords[1], joints))
+            rows.extend([bbox[1], bbox[3]])
+
+            # sort horizontal and vertical segments
+            cols = merge_close_lines(sorted(cols), line_tol=self.line_tol)
+            rows = merge_close_lines(
+                sorted(rows, reverse=True),
+                line_tol=self.line_tol
+            )
+            parse["col_anchors"] = cols
+            parse["row_anchors"] = rows
+
+    def _generate_columns_and_rows(self, bbox, user_cols):
         # select elements which lie within table_bbox
         v_s, h_s = segments_in_bbox(
             bbox, self.vertical_segments, self.horizontal_segments
@@ -270,21 +309,17 @@ class Lattice(BaseParser):
             self.horizontal_text,
             self.vertical_text
         )
+        parse = self.table_bbox_parses[bbox]

-        cols, rows = zip(*self.table_bbox[bbox])
-        cols, rows = list(cols), list(rows)
-        cols.extend([bbox[0], bbox[2]])
-        rows.extend([bbox[1], bbox[3]])
-        # sort horizontal and vertical segments
-        cols = merge_close_lines(sorted(cols), line_tol=self.line_tol)
-        rows = merge_close_lines(
-            sorted(rows, reverse=True),
-            line_tol=self.line_tol
-        )
         # make grid using x and y coord of shortlisted rows and cols
-        cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
-        rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)]
+        cols = [
+            (parse["col_anchors"][i], parse["col_anchors"][i + 1])
+            for i in range(0, len(parse["col_anchors"]) - 1)
+        ]
+        rows = [
+            (parse["row_anchors"][i], parse["row_anchors"][i + 1])
+            for i in range(0, len(parse["row_anchors"]) - 1)
+        ]
         return cols, rows, v_s, h_s

     def _generate_table(self, table_idx, cols, rows, **kwargs):
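The col_anchors/row_anchors precomputed per bbox in _generate_table_bbox are consumed here simply by pairing consecutive anchors into cell spans, for example (anchors are hypothetical):

    col_anchors = [0.0, 50.0, 100.0]
    cols = [
        (col_anchors[i], col_anchors[i + 1])
        for i in range(0, len(col_anchors) - 1)
    ]
    # -> [(0.0, 50.0), (50.0, 100.0)]  # two column spans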
@@ -19,7 +19,8 @@ from ..utils import (
     text_in_bbox,
     textlines_overlapping_bbox,
     bbox_from_textlines,
-    find_columns_coordinates,
+    find_columns_boundaries,
+    boundaries_to_split_lines,
     text_in_bbox_per_axis,
 )
@@ -438,7 +439,7 @@ class TextNetworks(TextAlignments):
         tls_search_space.remove(most_aligned_tl)
         tls_in_bbox = [most_aligned_tl]
         last_bbox = None
-        last_cols_cand = [most_aligned_tl.x0, most_aligned_tl.x1]
+        last_cols_bounds = [(most_aligned_tl.x0, most_aligned_tl.x1)]
         while last_bbox != bbox:
             if parse_details_search is not None:
                 # Store debug info
@@ -479,9 +480,9 @@ class TextNetworks(TextAlignments):
             # of the new row won't reduce the number of columns.
             # This happens when text covers multiple rows - that's only
             # allowed in the header, treated separately.
-            cols_cand = find_columns_coordinates(tls_in_new_box)
+            cols_bounds = find_columns_boundaries(tls_in_new_box)
             if direction in ["bottom", "top"] and \
-                    len(cols_cand) < len(last_cols_cand):
+                    len(cols_bounds) < len(last_cols_bounds):
                 continue

             # We have an expansion candidate: register it, update the
@@ -489,7 +490,7 @@ class TextNetworks(TextAlignments):
             # We use bbox_from_textlines instead of cand_bbox in case some
             # overlapping textlines require a large bbox for strict fit.
             bbox = cand_bbox = list(bbox_from_textlines(tls_in_new_box))
-            last_cols_cand = cols_cand
+            last_cols_bounds = cols_bounds
             tls_in_bbox.extend(new_tls)
             for i in range(len(tls_search_space) - 1, -1, -1):
                 textline = tls_search_space[i]
@@ -591,7 +592,7 @@ class Network(TextBaseParser):
         textlines = self._apply_regions_filter(all_textlines)

         textlines_processed = {}
-        self.table_bbox = {}
+        self.table_bbox_parses = {}
         if self.parse_details is not None:
             parse_details_network_searches = []
             self.parse_details["network_searches"] = \
@@ -641,7 +642,8 @@ class Network(TextBaseParser):
             # Get all the textlines that overlap with the box, compute
             # columns
             tls_in_bbox = textlines_overlapping_bbox(bbox_body, textlines)
-            cols_anchors = find_columns_coordinates(tls_in_bbox)
+            cols_boundaries = find_columns_boundaries(tls_in_bbox)
+            cols_anchors = boundaries_to_split_lines(cols_boundaries)

             # Unless the user gave us strict bbox_body, try to find a header
             # above the body to build the full bbox.
@@ -662,10 +664,11 @@ class Network(TextBaseParser):

             table_parse = {
                 "bbox_body": bbox_body,
+                "cols_boundaries": cols_boundaries,
                 "cols_anchors": cols_anchors,
                 "bbox_full": bbox_full
             }
-            self.table_bbox[bbox_full] = table_parse
+            self.table_bbox_parses[bbox_full] = table_parse

             if self.parse_details is not None:
                 self.parse_details["col_searches"].append(table_parse)
@@ -678,7 +681,7 @@ class Network(TextBaseParser):
             textlines
         ))

-    def _generate_columns_and_rows(self, bbox, table_idx):
+    def _generate_columns_and_rows(self, bbox, user_cols):
         # select elements which lie within table_bbox
         self.t_bbox = text_in_bbox_per_axis(
             bbox,
@@ -706,18 +709,14 @@ class Network(TextBaseParser):
         rows_grouped = self._group_rows(all_tls, row_tol=self.row_tol)
         rows = self._join_rows(rows_grouped, text_y_max, text_y_min)

-        if self.columns is not None and self.columns[table_idx] != "":
-            # user has to input boundary columns too
-            # take (0, pdf_width) by default
-            # similar to else condition
-            # len can't be 1
-            cols = self.columns[table_idx].split(",")
-            cols = [float(c) for c in cols]
-            cols.insert(0, text_x_min)
-            cols.append(text_x_max)
-            cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
+        if user_cols is not None:
+            cols = [text_x_min] + user_cols + [text_x_max]
+            cols = [
+                (cols[i], cols[i + 1])
+                for i in range(0, len(cols) - 1)
+            ]
         else:
-            parse_details = self.table_bbox[bbox]
+            parse_details = self.table_bbox_parses[bbox]
             col_anchors = parse_details["cols_anchors"]
             cols = list(map(
                 lambda idx: [col_anchors[idx], col_anchors[idx + 1]],
@@ -122,14 +122,14 @@ class Stream(TextBaseParser):
                     self.horizontal_text)
                 hor_text.extend(region_text)
             # find tables based on nurminen's detection algorithm
-            table_bbox = self._nurminen_table_detection(hor_text)
+            table_bbox_parses = self._nurminen_table_detection(hor_text)
         else:
-            table_bbox = {}
+            table_bbox_parses = {}
             for area_str in self.table_areas:
-                table_bbox[bbox_from_str(area_str)] = None
-        self.table_bbox = table_bbox
+                table_bbox_parses[bbox_from_str(area_str)] = None
+        self.table_bbox_parses = table_bbox_parses

-    def _generate_columns_and_rows(self, bbox, table_idx):
+    def _generate_columns_and_rows(self, bbox, user_cols):
         # select elements which lie within table_bbox
         self.t_bbox = text_in_bbox_per_axis(
             bbox,
@@ -140,26 +140,18 @@ class Stream(TextBaseParser):
         text_x_min, text_y_min, text_x_max, text_y_max = bbox_from_textlines(
             self.t_bbox["horizontal"] + self.t_bbox["vertical"]
         )
-        # FRHTODO:
-        # This algorithm takes the horizontal textlines in the bbox, and groups
-        # them into rows based on their bottom y0.
-        # That's wrong: it misses the vertical items, and misses out on all
-        # the alignment identification work we've done earlier.
         rows_grouped = self._group_rows(
             self.t_bbox["horizontal"], row_tol=self.row_tol)
         rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
         elements = [len(r) for r in rows_grouped]

-        if self.columns is not None and self.columns[table_idx] != "":
-            # user has to input boundary columns too
-            # take (0, pdf_width) by default
-            # similar to else condition
-            # len can't be 1
-            cols = self.columns[table_idx].split(",")
-            cols = [float(c) for c in cols]
-            cols.insert(0, text_x_min)
-            cols.append(text_x_max)
-            cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
+        if user_cols is not None:
+            cols = [text_x_min] + user_cols + [text_x_max]
+            cols = [
+                (cols[i], cols[i + 1])
+                for i in range(0, len(cols) - 1)
+            ]
         else:
             # calculate mode of the list of number of elements in
             # each row to guess the number of columns
@@ -175,8 +167,8 @@ class Stream(TextBaseParser):
                 ncols = max(set(elements), key=elements.count)
             else:
                 warnings.warn(
-                    "No tables found in table area {}"
-                    .format(table_idx + 1)
+                    "No tables found in table area {bbox}".format(
+                        bbox=bbox)
                 )
             cols = [
                 (t.x0, t.x1)
@@ -74,7 +74,7 @@ def draw_labeled_bbox(
     )


-def draw_pdf(table, ax, to_pdf_scale=True):
+def draw_pdf(table, ax):
     """Draw the content of the table's source pdf into the passed subplot

     Parameters
@@ -83,14 +83,9 @@ def draw_pdf(table, ax, to_pdf_scale=True):

     ax : matplotlib.axes.Axes (optional)

-    to_pdf_scale : bool (optional)
-
     """
     img = table.get_pdf_image()
-    if to_pdf_scale:
-        ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
-    else:
-        ax.imshow(img)
+    ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))


 def draw_parse_constraints(table, ax):
@@ -132,8 +127,6 @@ def draw_text(table, ax):
     table : camelot.core.Table
     ax : matplotlib.axes.Axes (optional)

-    ax : matplotlib.axes.Axes
-
     """
     bbox = bbox_from_textlines(table.textlines)
     for t in table.textlines:
@@ -150,18 +143,14 @@ def draw_text(table, ax):
     extend_axe_lim(ax, bbox)


-def prepare_plot(table, ax=None, to_pdf_scale=True):
+def prepare_plot(table, ax=None):
     """Initialize plot and draw common components

     Parameters
     ----------
     table : camelot.core.Table

     ax : matplotlib.axes.Axes (optional)
-    to_pdf_scale :
-
-    ax : matplotlib.axes.Axes
-
-    to_pdf_scale : bool (optional)

     Returns
     -------
@@ -170,7 +159,7 @@ def prepare_plot(table, ax=None, to_pdf_scale=True):
     if ax is None:
         fig = plt.figure()
         ax = fig.add_subplot(111, aspect="equal")
-    draw_pdf(table, ax, to_pdf_scale)
+    draw_pdf(table, ax)
     draw_parse_constraints(table, ax)
     return ax
@@ -186,7 +175,8 @@ class PlotMethods():
         table: camelot.core.Table
             A Camelot Table.
         kind : str, optional (default: 'text')
-            {'text', 'grid', 'contour', 'joint', 'line'}
+            {'text', 'grid', 'contour', 'joint', 'line',
+             'network_table_search'}
             The element type for which a plot should be generated.
         filepath: str, optional (default: None)
             Absolute path for saving the generated plot.
@@ -203,9 +193,12 @@ class PlotMethods():
             raise NotImplementedError(
                 "Lattice flavor does not support kind='{}'".format(kind)
             )
-        if table.flavor in ["stream", "network"] and kind in ["line"]:
+        if table.flavor != "lattice" and kind in ["line"]:
             raise NotImplementedError(
-                "Stream flavor does not support kind='{}'".format(kind)
+                "{flavor} flavor does not support kind='{kind}'".format(
+                    flavor=table.flavor,
+                    kind=kind
+                )
             )

         plot_method = getattr(self, kind)
@@ -274,25 +267,21 @@ class PlotMethods():

         """
         _FOR_LATTICE = table.flavor == "lattice"
-        ax = prepare_plot(table, ax, to_pdf_scale=not _FOR_LATTICE)
+        ax = prepare_plot(table, ax)

-        if _FOR_LATTICE:
-            table_bbox = table._bbox_unscaled
-        else:
-            table_bbox = {table._bbox: None}
-
         if not _FOR_LATTICE:
             draw_text(table, ax)

-        for t in table_bbox.keys():
-            ax.add_patch(
-                patches.Rectangle(
-                    (t[0], t[1]), t[2] - t[0], t[3] - t[1],
-                    fill=False, color="red"
-                )
+        ax.add_patch(
+            patches.Rectangle(
+                (table._bbox[0], table._bbox[1]),
+                table._bbox[2] - table._bbox[0],
+                table._bbox[3] - table._bbox[1],
+                fill=False, color="red"
             )
-            if not _FOR_LATTICE:
-                extend_axe_lim(ax, t)
+        )
+        if not _FOR_LATTICE:
+            extend_axe_lim(ax, table._bbox)

         return ax.get_figure()
@@ -393,12 +382,10 @@ class PlotMethods():
         fig : matplotlib.fig.Figure

         """
-        ax = prepare_plot(table, ax, to_pdf_scale=False)
-        table_bbox = table._bbox_unscaled
+        ax = prepare_plot(table, ax)
         x_coord = []
         y_coord = []
-        for k in table_bbox.keys():
-            for coord in table_bbox[k]:
-                x_coord.append(coord[0])
-                y_coord.append(coord[1])
+        for coord in table.parse["joints"]:
+            x_coord.append(coord[0])
+            y_coord.append(coord[1])
         ax.plot(x_coord, y_coord, "ro")
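Since the joint plot now reads the scaled joints from table.parse["joints"], it no longer needs the unscaled image bbox; the public plotting API is unchanged. A sketch (file name illustrative; lattice is the flavor that produces joints):

    import camelot

    tables = camelot.read_pdf("foo.pdf", flavor="lattice")
    fig = camelot.plot(tables[0], kind="joint")  # red dots at line joints
    fig.show()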
camelot/utils.py
@@ -297,8 +297,9 @@ def scale_image(tables, v_segments, h_segments, factors):
         j_x, j_y = zip(*tables[k])
         j_x = [scale(j, scaling_factor_x) for j in j_x]
         j_y = [scale(abs(translate(-img_y, j)), scaling_factor_y) for j in j_y]
-        joints = zip(j_x, j_y)
-        tables_new[(x1, y1, x2, y2)] = joints
+        tables_new[(x1, y1, x2, y2)] = {
+            "joints": list(zip(j_x, j_y))
+        }

     v_segments_new = []
     for v in v_segments:
@@ -434,6 +435,16 @@ def bbox_from_str(bbox_str):
     )


+def bboxes_overlap(bbox1, bbox2):
+    (left1, bottom1, right1, top1) = bbox1
+    (left2, bottom2, right2, top2) = bbox2
+    return (
+        (left1 < left2 < right1) or (left1 < right2 < right1)
+    ) and (
+        (bottom1 < bottom2 < top1) or (bottom1 < top2 < top1)
+    )
+
+
 def textlines_overlapping_bbox(bbox, textlines):
     """Returns all text objects which overlap or are within a bounding box.
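A quick look at the semantics of the new predicate (coordinates are made up): it requires an edge of bbox2 to fall strictly inside bbox1 on both axes, so the result is not symmetric when one box fully contains the other:

    bboxes_overlap((0, 0, 10, 10), (5, 5, 15, 15))   # True: corner overlap
    bboxes_overlap((0, 0, 10, 10), (2, 2, 8, 8))     # True: bbox2 inside bbox1
    bboxes_overlap((2, 2, 8, 8), (0, 0, 10, 10))     # False: no edge of the
                                                     # outer box lies inside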
@@ -451,12 +462,10 @@ def textlines_overlapping_bbox(bbox, textlines):
         List of PDFMiner text objects.

     """
-    (left, bottom, right, top) = bbox
     t_bbox = [
         t
         for t in textlines
-        if ((left < t.x0 < right) or (left < t.x1 < right))
-        and ((bottom < t.y0 < top) or (bottom < t.y1 < top))
+        if bboxes_overlap(bbox, (t.x0, t.y0, t.x1, t.y1))
     ]
     return t_bbox
@@ -560,27 +569,25 @@ def bbox_from_textlines(textlines):
     return bbox


-def find_columns_coordinates(tls, min_gap=1.0):
-    """Given a list of text objects, guess columns boundaries and returns a
-    list of x-coordinates for split points between columns.
+def find_columns_boundaries(tls, min_gap=1.0):
+    """Make a list of disjunct cols boundaries for a list of text objects

     Parameters
     ----------
     tls : list of PDFMiner text object.

-    min_gap : minimum distance between columns. Any elements closer than this
-    threshold are merged together. This is to prevent spaces between words
-    to be misinterpreted as column boundaries.
+    min_gap : minimum distance between columns. Any elements closer than
+    this threshold are merged together. This is to prevent spaces between
+    words to be misinterpreted as boundaries.

     Returns
     -------
-    cols_anchors : list
-        List of x-coordinates for columns.
+    boundaries : list
+        List x-coordinates for cols.
+        [(1st col left, 1st col right), (2nd col left, 2nd col right), ...]

     """
-    # Make a list of disjunct cols boundaries across the textlines
-    # that comprise the table.
-    # [(1st col left, 1st col right), (2nd col left, 2nd col right), ...]
     cols_bounds = []
     tls.sort(key=lambda tl: tl.x0)
     for tl in tls:
@@ -588,18 +595,64 @@ def find_columns_coordinates(tls, min_gap=1.0):
             cols_bounds.append([tl.x0, tl.x1])
         else:
             cols_bounds[-1][1] = max(cols_bounds[-1][1], tl.x1)
+    return cols_bounds
+
+
+def find_rows_boundaries(tls, min_gap=1.0):
+    """Make a list of disjunct rows boundaries for a list of text objects
+
+    Parameters
+    ----------
+    tls : list of PDFMiner text object.
+
+    min_gap : minimum distance between rows. Any elements closer than
+    this threshold are merged together.
+
+    Returns
+    -------
+    boundaries : list
+        List y-coordinates for rows.
+        [(1st row bottom, 1st row top), (2nd row bottom, 2nd row top), ...]
+
+    """
+    rows_bounds = []
+    tls.sort(key=lambda tl: tl.y0)
+    for tl in tls:
+        if (not rows_bounds) or rows_bounds[-1][1] + min_gap < tl.y0:
+            rows_bounds.append([tl.y0, tl.y1])
+        else:
+            rows_bounds[-1][1] = max(rows_bounds[-1][1], tl.y1)
+    return rows_bounds
+
+
+def boundaries_to_split_lines(boundaries):
+    """Find split lines given a list of boundaries between rows or cols.
+
+    Boundaries:  [ a ]   [b]   [   c   ]   [d]
+    Splits:      |      |     |          |     |
+
+    Parameters
+    ----------
+    boundaries : list
+        List of tuples of x- (for columns) or y- (for rows) coord boundaries.
+        These are the (left, right most) or (bottom, top most) coordinates.
+
+    Returns
+    -------
+    anchors : list
+        List of coordinates representing the split points, each half way
+        between boundaries
+
+    """
     # From the row boundaries, identify splits by getting the mid points
     # between the boundaries.
-    # Row boundaries: [ a ] [b] [   c   ]
-    # Splits:         |    |   |        |
-    cols_anchors = list(map(
-        lambda idx: (cols_bounds[idx-1][1] + cols_bounds[idx][0]) / 2.0,
-        range(1, len(cols_bounds))
+    anchors = list(map(
+        lambda idx: (boundaries[idx-1][1] + boundaries[idx][0]) / 2.0,
+        range(1, len(boundaries))
     ))
-    cols_anchors.insert(0, cols_bounds[0][0])
-    cols_anchors.append(cols_bounds[-1][1])
-    return cols_anchors
+    anchors.insert(0, boundaries[0][0])
+    anchors.append(boundaries[-1][1])
+    return anchors


 def get_index_closest_point(point, sorted_list, fn=lambda x: x):
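The two helpers compose into a small pipeline: boundary detection yields disjunct (start, end) extents, and boundaries_to_split_lines turns those into split anchors (the outer edges plus the midpoint of each gap). A sketch using stand-in textlines rather than real PDFMiner objects, since only .x0/.x1 are consulted:

    from collections import namedtuple

    TL = namedtuple("TL", ["x0", "x1", "y0", "y1"])
    tls = [TL(0, 9, 0, 10), TL(9.5, 20, 0, 10), TL(30, 40, 0, 10)]

    find_columns_boundaries(tls, min_gap=1.0)
    # -> [[0, 20], [30, 40]]  # the 0.5-wide gap at x=9 is below min_gap

    boundaries_to_split_lines([[0, 20], [30, 40]])
    # -> [0, 25.0, 40]        # 25.0 = (20 + 30) / 2; ends are the outer edges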
@@ -1129,17 +1182,20 @@ def get_text_objects(layout, ltype="char", t=None):
     return t


-def export_pdf_as_png(pdf_path, destination_path):
+def export_pdf_as_png(pdf_path, destination_path, resolution=300):
     """Generate an image from a pdf.

     Parameters
     ----------
     pdf_path : str
     destination_path : str
+    resolution : int
     """
-    gs_call = "-q -sDEVICE=png16m -o {destination_path} -r300 {pdf_path}"\
+    gs_call = "-q -sDEVICE=png16m -o " \
+              "{destination_path} -r{resolution} {pdf_path}" \
         .format(
             destination_path=destination_path,
+            resolution=resolution,
             pdf_path=pdf_path
         )
    gs_call = gs_call.encode().split()
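The resolution argument now feeds Ghostscript's -r flag in place of the hard-coded 300 DPI; Lattice passes its new self.resolution attribute through. For example, the formatted command for a 150 DPI export would be (paths illustrative):

    export_pdf_as_png("/tmp/page.pdf", "/tmp/page.png", resolution=150)
    # gs_call before encode()/split():
    # "-q -sDEVICE=png16m -o /tmp/page.png -r150 /tmp/page.pdf"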
File diff suppressed because one or more lines are too long

tests/data.py
@@ -2074,6 +2074,502 @@ data_network_vertical_headers = [
     ],
 ]

+# Compared to network, hybrid detects additional sparse columns
+data_hybrid_vertical_headers = [
+    ["", "", "", "", "", "", "", "", "", "", "Congress-", "", "",
+     "Senator 36th", "", "Rep106th", "", "Reg. of", "", "Road", "", "",
+     "Distri", "Dist", "", "", "Dist"],
+    ["", "", "", "", "", "", "", "", "", "", "", "1st Dist", "Dist.", "",
+     "", "Dist.", "Deeds", "", "Commission", "", "District #1", "",
+     "ct #2", "#3", "Dist #4", "", "#5"],
+    ["", "", "", "", "", "Governor", "", "", "U.S. Senator", "", "", "",
+     "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""],
+    ["", "Number of Registered voters", "Poll Book Totals", "Brian Calley",
+     "Patrick Colbeck", "Jim Hines", "Bill Schuette", "John James",
+     "Sandy Pensler", "", "Jack Bergman", "", "Jim Stamas", "", "Sue Allor",
+     "", "Melissa A. Cordes", "", "Al Scully", "", "Daniel G. Gauthier",
+     "Craig M. Clemens", "Craig Johnston", "Carolyn Brummund", "Adam Brege",
+     "David Bielusiak", ""],
+    ["Alcona", "963", "439", "55", "26", "47", "164", "173", "111", "",
+     "268", "", "272", "", "275", "", "269", "", "271", "", "224", "76",
+     "", "", "", "", ""],
+    ["Caledonia", "923", "393", "40", "23", "45", "158", "150", "103", "",
+     "244", "", "247", "", "254", "", "255", "", "244", "", "139", "143",
+     "", "", "", "", ""],
+    ["Curtis", "1026", "349", "30", "30", "25", "102", "95", "84", "",
+     "159", "", "164", "", "162", "", "161", "", "157", "", "", "", "",
+     "", "", "", ""],
+    ["Greenbush", "1212", "423", "56", "26", "40", "126", "104", "131", "",
+     "208", "", "213", "", "214", "", "215", "", "208", "", "", "", "",
+     "208", "", "", ""],
+    ["Gustin", "611", "180", "22", "35", "17", "55", "73", "45", "",
+     "108", "", "104", "", "111", "", "111", "", "109", "", "", "", "",
+     "", "81", "42", ""],
+    ["Harrisville", "1142", "430", "45", "90", "29", "101", "155", "94", "",
+     "226", "", "226", "", "232", "", "244", "", "226", "", "", "",
+     "232", "", "", "", ""],
+    ["Hawes", "884", "293", "38", "36", "27", "109", "121", "84", "",
+     "192", "", "195", "", "195", "", "193", "", "184", "", "", "", "",
+     "", "118", "87", ""],
+    ["Haynes", "626", "275", "31", "20", "32", "104", "121", "53", "",
+     "163", "", "163", "", "173", "", "161", "", "152", "", "", "",
+     "76", "", "69", "31", ""],
+    ["Mikado", "781", "208", "19", "39", "17", "81", "90", "63", "",
+     "149", "", "149", "", "145", "", "147", "", "143", "", "", "", "",
+     "113", "", "", ""],
+    ["Millen", "353", "139", "7", "16", "13", "38", "49", "19", "",
+     "62", "", "66", "", "67", "", "66", "", "62", "", "", "", "", "",
+     "", "", ""],
+    ["Mitchell", "327", "96", "12", "17", "7", "29", "41", "17", "",
+     "57", "", "55", "", "57", "", "60", "", "56", "", "", "", "", "",
+     "", "", ""],
+    ["City Harrisville", "389", "171", "16", "15", "18", "35", "49", "31",
+     "", "78", "", "80", "", "82", "", "81", "", "77", "", "", "",
+     "73", "", "", "", ""],
+    ["Totals", "9237", "3396", "371", "373", "317", "1102", "1221", "835",
+     "0", "1914", "0", "1934", "", "1967", "", "1963", "0", "1889", "0",
+     "363", "219", "381", "321", "268", "160", "0"],
+]
+
+
 data_stream_table_areas = [
Binary file not shown. (Before: 33 KiB, After: 46 KiB)
@@ -285,6 +285,19 @@ def test_network_layout_kwargs():
     assert_frame_equal(df, tables[0].df)


+# Hybrid parser
+def test_hybrid_vertical_header():
+    """Tests a complex table with a vertically text header.
+    """
+    df = pd.DataFrame(data_hybrid_vertical_headers)
+
+    filename = os.path.join(testdir, "vertical_header.pdf")
+    tables = camelot.read_pdf(filename, flavor="hybrid")
+    assert len(tables) == 1
+    assert_frame_equal(df, tables[0].df)
+
+
 # Lattice parser tests
 def test_lattice():
     df = pd.DataFrame(data_lattice)