Fix issues found during a pass across most test cases

* Clean up the parser comparison notebook
* Address issue where hybrid didn't honor the columns parameter
* Fix dropping of empty rows/columns in hybrid
* Hybrid learns table y-dimensions from lattice
pull/153/head
Frh 2020-06-16 13:04:53 -07:00
parent 9c971a18f0
commit 71805f9333
5 changed files with 110 additions and 129 deletions

View File

@ -5,6 +5,7 @@ from ..utils import (
boundaries_to_split_lines,
)
import numpy as np
from .base import BaseParser
from .network import Network
from .lattice import Lattice
@ -67,6 +68,7 @@ class Hybrid(BaseParser):
strip_text=strip_text,
debug=debug,
)
self.columns = columns # Column settings impact the hybrid table
self.network_parser = Network(
table_regions=table_regions,
table_areas=table_areas,
@ -109,9 +111,11 @@ class Hybrid(BaseParser):
table = parser._generate_table(table_idx, bbox, cols, rows, **kwargs)
# Because hybrid can inject extraneous splits from both lattice and
# network, remove lines / cols that are completely empty.
df = table.df
df[df.astype(bool)].dropna(axis=0, how="all", inplace=True)
df[df.astype(bool)].dropna(axis=1, how="all", inplace=True)
table.df = table.df.replace('', np.nan)
table.df = table.df.dropna(axis=0, how="all")
table.df = table.df.dropna(axis=1, how="all")
table.df = table.df.replace(np.nan, '')
table.shape = table.df.shape
return table
@staticmethod
@ -172,13 +176,12 @@ class Hybrid(BaseParser):
""" Identify splits that were only detected by lattice or by network
"""
lattice_parse = self.lattice_parser.table_bbox_parses[lattice_bbox]
lattice_cols, lattice_rows = \
lattice_parse["col_anchors"], lattice_parse["row_anchors"]
lattice_cols = lattice_parse["col_anchors"]
network_bbox_data = self.network_parser.table_bbox_parses[network_bbox]
network_cols_boundaries = network_bbox_data["cols_boundaries"]
# Favor hybrid, but complete or adjust its columns based on the
# Favor network, but complete or adjust its columns based on the
# splits identified by lattice.
if network_cols_boundaries is None:
self.table_bbox_parses[lattice_bbox] = self.lattice_parser
@ -188,8 +191,10 @@ class Hybrid(BaseParser):
lattice_cols,
self.lattice_parser.joint_tol)
augmented_bbox = (
network_cols_boundaries[0][0], network_bbox[1],
network_cols_boundaries[-1][1], network_bbox[3],
network_cols_boundaries[0][0],
min(lattice_bbox[1], network_bbox[1]),
network_cols_boundaries[-1][1],
max(lattice_bbox[3], network_bbox[3]),
)
network_bbox_data["cols_anchors"] = \
boundaries_to_split_lines(network_cols_boundaries)

File diff suppressed because one or more lines are too long

View File

@ -1,7 +1,11 @@
# -*- coding: utf-8 -*-
data_stream = [
data_hybrid = [
[
"", "Table: 5 Public Health Outlay 2012-13 (Budget"
" Estimates) (Rs. in 000)", "", "", "", "", "", ""
],
["States-A", "Revenue", "", "Capital", "", "Total", "Others(1)", "Total"],
["", "", "", "", "", "Revenue &", "", ""],
["", "Medical &", "Family", "Medical &", "Family", "", "", ""],
@ -224,6 +228,10 @@ data_stream = [
],
]
# Hybrid includes the header because the boundaries of the table include it,
# but stream/network don't include it.
data_stream = data_hybrid[1:]
data_stream_table_rotated = [
[
"Table 21 Current use of contraception by background characteristics"
@ -2074,6 +2082,11 @@ data_network_vertical_headers = [
# Compared to network, hybrid detects additional sparse columns
data_hybrid_vertical_headers = [
[
"", "", "", "", "", "STATE", "", "", "", "CONGRESSIONAL", "", "",
"", "", "LEGISLATIVE", "", "", "COUNTY", "", "COUNTY", "", "",
"County Commissioner", "", "", "", ""
],
[
"",
"",

Binary file not shown.

View File

@ -287,7 +287,7 @@ def test_network_layout_kwargs():
# Hybrid parser
def test_hybrid():
df = pd.DataFrame(data_stream)
df = pd.DataFrame(data_hybrid)
filename = os.path.join(testdir, "health.pdf")
tables = camelot.read_pdf(filename, flavor="hybrid")
@ -324,6 +324,19 @@ def test_hybrid_process_background():
assert_frame_equal(df, tables[1].df)
def test_hybrid_split_text():
    """Check the hybrid flavor with explicit columns and ``split_text``.

    Parses ``tabula/m27.pdf`` with user-supplied column separators and
    ``split_text=True``, then compares the first extracted table with the
    ``data_network_split_text`` fixture.
    """
    expected = pd.DataFrame(data_network_split_text)
    pdf_path = os.path.join(testdir, "tabula/m27.pdf")
    # A single comma-separated string of x-coordinates, passed as a
    # one-element list.
    column_separators = ["72,95,209,327,442,529,566,606,683"]
    parsed = camelot.read_pdf(
        pdf_path,
        flavor="hybrid",
        columns=column_separators,
        split_text=True,
    )
    first_table = parsed[0]
    assert_frame_equal(expected, first_table.df)
# Lattice parser tests
def test_lattice():
df = pd.DataFrame(data_lattice)