Fix issues following a pass across most test cases
* Clean up the parser comparison notebook
* Address issue where hybrid didn't honor the columns parameter
* Fix dropping of empty rows/columns in hybrid
* Hybrid learns table y-dimensions from lattice
pull/153/head
parent
9c971a18f0
commit
71805f9333
|
|
@ -5,6 +5,7 @@ from ..utils import (
|
||||||
boundaries_to_split_lines,
|
boundaries_to_split_lines,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
from .base import BaseParser
|
from .base import BaseParser
|
||||||
from .network import Network
|
from .network import Network
|
||||||
from .lattice import Lattice
|
from .lattice import Lattice
|
||||||
|
|
@ -67,6 +68,7 @@ class Hybrid(BaseParser):
|
||||||
strip_text=strip_text,
|
strip_text=strip_text,
|
||||||
debug=debug,
|
debug=debug,
|
||||||
)
|
)
|
||||||
|
self.columns = columns # Columns settings impacts the hybrid table
|
||||||
self.network_parser = Network(
|
self.network_parser = Network(
|
||||||
table_regions=table_regions,
|
table_regions=table_regions,
|
||||||
table_areas=table_areas,
|
table_areas=table_areas,
|
||||||
|
|
@ -109,9 +111,11 @@ class Hybrid(BaseParser):
|
||||||
table = parser._generate_table(table_idx, bbox, cols, rows, **kwargs)
|
table = parser._generate_table(table_idx, bbox, cols, rows, **kwargs)
|
||||||
# Because hybrid can inject extraneous splits from both lattice and
|
# Because hybrid can inject extraneous splits from both lattice and
|
||||||
# network, remove lines / cols that are completely empty.
|
# network, remove lines / cols that are completely empty.
|
||||||
df = table.df
|
table.df = table.df.replace('', np.nan)
|
||||||
df[df.astype(bool)].dropna(axis=0, how="all", inplace=True)
|
table.df = table.df.dropna(axis=0, how="all")
|
||||||
df[df.astype(bool)].dropna(axis=1, how="all", inplace=True)
|
table.df = table.df.dropna(axis=1, how="all")
|
||||||
|
table.df = table.df.replace(np.nan, '')
|
||||||
|
table.shape = table.df.shape
|
||||||
return table
|
return table
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
|
@ -172,13 +176,12 @@ class Hybrid(BaseParser):
|
||||||
""" Identify splits that were only detected by lattice or by network
|
""" Identify splits that were only detected by lattice or by network
|
||||||
"""
|
"""
|
||||||
lattice_parse = self.lattice_parser.table_bbox_parses[lattice_bbox]
|
lattice_parse = self.lattice_parser.table_bbox_parses[lattice_bbox]
|
||||||
lattice_cols, lattice_rows = \
|
lattice_cols = lattice_parse["col_anchors"]
|
||||||
lattice_parse["col_anchors"], lattice_parse["row_anchors"]
|
|
||||||
|
|
||||||
network_bbox_data = self.network_parser.table_bbox_parses[network_bbox]
|
network_bbox_data = self.network_parser.table_bbox_parses[network_bbox]
|
||||||
network_cols_boundaries = network_bbox_data["cols_boundaries"]
|
network_cols_boundaries = network_bbox_data["cols_boundaries"]
|
||||||
|
|
||||||
# Favor hybrid, but complete or adjust its columns based on the
|
# Favor network, but complete or adjust its columns based on the
|
||||||
# splits identified by lattice.
|
# splits identified by lattice.
|
||||||
if network_cols_boundaries is None:
|
if network_cols_boundaries is None:
|
||||||
self.table_bbox_parses[lattice_bbox] = self.lattice_parser
|
self.table_bbox_parses[lattice_bbox] = self.lattice_parser
|
||||||
|
|
@ -188,8 +191,10 @@ class Hybrid(BaseParser):
|
||||||
lattice_cols,
|
lattice_cols,
|
||||||
self.lattice_parser.joint_tol)
|
self.lattice_parser.joint_tol)
|
||||||
augmented_bbox = (
|
augmented_bbox = (
|
||||||
network_cols_boundaries[0][0], network_bbox[1],
|
network_cols_boundaries[0][0],
|
||||||
network_cols_boundaries[-1][1], network_bbox[3],
|
min(lattice_bbox[1], network_bbox[1]),
|
||||||
|
network_cols_boundaries[-1][1],
|
||||||
|
max(lattice_bbox[3], network_bbox[3]),
|
||||||
)
|
)
|
||||||
network_bbox_data["cols_anchors"] = \
|
network_bbox_data["cols_anchors"] = \
|
||||||
boundaries_to_split_lines(network_cols_boundaries)
|
boundaries_to_split_lines(network_cols_boundaries)
|
||||||
|
|
|
||||||
File diff suppressed because one or more lines are too long
|
|
@ -1,7 +1,11 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
|
||||||
data_stream = [
|
data_hybrid = [
|
||||||
|
[
|
||||||
|
"", "Table: 5 Public Health Outlay 2012-13 (Budget"
|
||||||
|
" Estimates) (Rs. in 000)", "", "", "", "", "", ""
|
||||||
|
],
|
||||||
["States-A", "Revenue", "", "Capital", "", "Total", "Others(1)", "Total"],
|
["States-A", "Revenue", "", "Capital", "", "Total", "Others(1)", "Total"],
|
||||||
["", "", "", "", "", "Revenue &", "", ""],
|
["", "", "", "", "", "Revenue &", "", ""],
|
||||||
["", "Medical &", "Family", "Medical &", "Family", "", "", ""],
|
["", "Medical &", "Family", "Medical &", "Family", "", "", ""],
|
||||||
|
|
@ -224,6 +228,10 @@ data_stream = [
|
||||||
],
|
],
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# Hybrid includes the header because the boundaries of the table include it,
|
||||||
|
# but stream/network don't include it.
|
||||||
|
data_stream = data_hybrid[1:]
|
||||||
|
|
||||||
data_stream_table_rotated = [
|
data_stream_table_rotated = [
|
||||||
[
|
[
|
||||||
"Table 21 Current use of contraception by background characteristics"
|
"Table 21 Current use of contraception by background characteristics"
|
||||||
|
|
@ -2074,6 +2082,11 @@ data_network_vertical_headers = [
|
||||||
|
|
||||||
# Compared to network, hybrid detects additional sparse columns
|
# Compared to network, hybrid detects additional sparse columns
|
||||||
data_hybrid_vertical_headers = [
|
data_hybrid_vertical_headers = [
|
||||||
|
[
|
||||||
|
"", "", "", "", "", "STATE", "", "", "", "CONGRESSIONAL", "", "",
|
||||||
|
"", "", "LEGISLATIVE", "", "", "COUNTY", "", "COUNTY", "", "",
|
||||||
|
"County Commissioner", "", "", "", ""
|
||||||
|
],
|
||||||
[
|
[
|
||||||
"",
|
"",
|
||||||
"",
|
"",
|
||||||
|
|
|
||||||
Binary file not shown.
|
|
@ -287,7 +287,7 @@ def test_network_layout_kwargs():
|
||||||
|
|
||||||
# Hybrid parser
|
# Hybrid parser
|
||||||
def test_hybrid():
|
def test_hybrid():
|
||||||
df = pd.DataFrame(data_stream)
|
df = pd.DataFrame(data_hybrid)
|
||||||
|
|
||||||
filename = os.path.join(testdir, "health.pdf")
|
filename = os.path.join(testdir, "health.pdf")
|
||||||
tables = camelot.read_pdf(filename, flavor="hybrid")
|
tables = camelot.read_pdf(filename, flavor="hybrid")
|
||||||
|
|
@ -324,6 +324,19 @@ def test_hybrid_process_background():
|
||||||
assert_frame_equal(df, tables[1].df)
|
assert_frame_equal(df, tables[1].df)
|
||||||
|
|
||||||
|
|
||||||
|
def test_hybrid_split_text():
|
||||||
|
df = pd.DataFrame(data_network_split_text)
|
||||||
|
|
||||||
|
filename = os.path.join(testdir, "tabula/m27.pdf")
|
||||||
|
tables = camelot.read_pdf(
|
||||||
|
filename,
|
||||||
|
flavor="hybrid",
|
||||||
|
columns=["72,95,209,327,442,529,566,606,683"],
|
||||||
|
split_text=True,
|
||||||
|
)
|
||||||
|
assert_frame_equal(df, tables[0].df)
|
||||||
|
|
||||||
|
|
||||||
# Lattice parser tests
|
# Lattice parser tests
|
||||||
def test_lattice():
|
def test_lattice():
|
||||||
df = pd.DataFrame(data_lattice)
|
df = pd.DataFrame(data_lattice)
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue