Fix issues following pass across most test cases
* Clean up the parser comparison notebook
* Address issue where hybrid didn't honor the columns parameter
* Fix dropping of empty rows/columns in hybrid
* Hybrid learns table y-dimensions from lattice
pull/153/head
parent
9c971a18f0
commit
71805f9333
|
|
@ -5,6 +5,7 @@ from ..utils import (
|
|||
boundaries_to_split_lines,
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
from .base import BaseParser
|
||||
from .network import Network
|
||||
from .lattice import Lattice
|
||||
|
|
@ -67,6 +68,7 @@ class Hybrid(BaseParser):
|
|||
strip_text=strip_text,
|
||||
debug=debug,
|
||||
)
|
||||
self.columns = columns # Columns settings impacts the hybrid table
|
||||
self.network_parser = Network(
|
||||
table_regions=table_regions,
|
||||
table_areas=table_areas,
|
||||
|
|
@ -109,9 +111,11 @@ class Hybrid(BaseParser):
|
|||
table = parser._generate_table(table_idx, bbox, cols, rows, **kwargs)
|
||||
# Because hybrid can inject extraneous splits from both lattice and
|
||||
# network, remove lines / cols that are completely empty.
|
||||
df = table.df
|
||||
df[df.astype(bool)].dropna(axis=0, how="all", inplace=True)
|
||||
df[df.astype(bool)].dropna(axis=1, how="all", inplace=True)
|
||||
table.df = table.df.replace('', np.nan)
|
||||
table.df = table.df.dropna(axis=0, how="all")
|
||||
table.df = table.df.dropna(axis=1, how="all")
|
||||
table.df = table.df.replace(np.nan, '')
|
||||
table.shape = table.df.shape
|
||||
return table
|
||||
|
||||
@staticmethod
|
||||
|
|
@ -172,13 +176,12 @@ class Hybrid(BaseParser):
|
|||
""" Identify splits that were only detected by lattice or by network
|
||||
"""
|
||||
lattice_parse = self.lattice_parser.table_bbox_parses[lattice_bbox]
|
||||
lattice_cols, lattice_rows = \
|
||||
lattice_parse["col_anchors"], lattice_parse["row_anchors"]
|
||||
lattice_cols = lattice_parse["col_anchors"]
|
||||
|
||||
network_bbox_data = self.network_parser.table_bbox_parses[network_bbox]
|
||||
network_cols_boundaries = network_bbox_data["cols_boundaries"]
|
||||
|
||||
# Favor hybrid, but complete or adjust its columns based on the
|
||||
# Favor network, but complete or adjust its columns based on the
|
||||
# splits identified by lattice.
|
||||
if network_cols_boundaries is None:
|
||||
self.table_bbox_parses[lattice_bbox] = self.lattice_parser
|
||||
|
|
@ -188,8 +191,10 @@ class Hybrid(BaseParser):
|
|||
lattice_cols,
|
||||
self.lattice_parser.joint_tol)
|
||||
augmented_bbox = (
|
||||
network_cols_boundaries[0][0], network_bbox[1],
|
||||
network_cols_boundaries[-1][1], network_bbox[3],
|
||||
network_cols_boundaries[0][0],
|
||||
min(lattice_bbox[1], network_bbox[1]),
|
||||
network_cols_boundaries[-1][1],
|
||||
max(lattice_bbox[3], network_bbox[3]),
|
||||
)
|
||||
network_bbox_data["cols_anchors"] = \
|
||||
boundaries_to_split_lines(network_cols_boundaries)
|
||||
|
|
|
|||
File diff suppressed because one or more lines are too long
|
|
@ -1,7 +1,11 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
|
||||
data_stream = [
|
||||
data_hybrid = [
|
||||
[
|
||||
"", "Table: 5 Public Health Outlay 2012-13 (Budget"
|
||||
" Estimates) (Rs. in 000)", "", "", "", "", "", ""
|
||||
],
|
||||
["States-A", "Revenue", "", "Capital", "", "Total", "Others(1)", "Total"],
|
||||
["", "", "", "", "", "Revenue &", "", ""],
|
||||
["", "Medical &", "Family", "Medical &", "Family", "", "", ""],
|
||||
|
|
@ -224,6 +228,10 @@ data_stream = [
|
|||
],
|
||||
]
|
||||
|
||||
# Hybrid includes the header because the boundaries of the table include it,
|
||||
# but stream/network don't include it.
|
||||
data_stream = data_hybrid[1:]
|
||||
|
||||
data_stream_table_rotated = [
|
||||
[
|
||||
"Table 21 Current use of contraception by background characteristics"
|
||||
|
|
@ -2074,6 +2082,11 @@ data_network_vertical_headers = [
|
|||
|
||||
# Compared to network, hybrid detects additional sparse columns
|
||||
data_hybrid_vertical_headers = [
|
||||
[
|
||||
"", "", "", "", "", "STATE", "", "", "", "CONGRESSIONAL", "", "",
|
||||
"", "", "LEGISLATIVE", "", "", "COUNTY", "", "COUNTY", "", "",
|
||||
"County Commissioner", "", "", "", ""
|
||||
],
|
||||
[
|
||||
"",
|
||||
"",
|
||||
|
|
|
|||
Binary file not shown.
|
|
@ -287,7 +287,7 @@ def test_network_layout_kwargs():
|
|||
|
||||
# Hybrid parser
|
||||
def test_hybrid():
|
||||
df = pd.DataFrame(data_stream)
|
||||
df = pd.DataFrame(data_hybrid)
|
||||
|
||||
filename = os.path.join(testdir, "health.pdf")
|
||||
tables = camelot.read_pdf(filename, flavor="hybrid")
|
||||
|
|
@ -324,6 +324,19 @@ def test_hybrid_process_background():
|
|||
assert_frame_equal(df, tables[1].df)
|
||||
|
||||
|
||||
def test_hybrid_split_text():
|
||||
df = pd.DataFrame(data_network_split_text)
|
||||
|
||||
filename = os.path.join(testdir, "tabula/m27.pdf")
|
||||
tables = camelot.read_pdf(
|
||||
filename,
|
||||
flavor="hybrid",
|
||||
columns=["72,95,209,327,442,529,566,606,683"],
|
||||
split_text=True,
|
||||
)
|
||||
assert_frame_equal(df, tables[0].df)
|
||||
|
||||
|
||||
# Lattice parser tests
|
||||
def test_lattice():
|
||||
df = pd.DataFrame(data_lattice)
|
||||
|
|
|
|||
Loading…
Reference in New Issue