Enable process_background option for hybrid

Trim empty cols and lines
pull/153/head
Frh 2020-05-08 15:08:12 -07:00
parent ae429fc248
commit ba5169b33d
7 changed files with 109 additions and 38 deletions

View File

@ -58,7 +58,7 @@ pass_config = click.make_pass_decorator(Config)
"-flag", "-flag",
"--flag_size", "--flag_size",
is_flag=True, is_flag=True,
help="Flag text based on" " font size. Useful to detect super/subscripts.", help="Flag text based on font size. Useful to detect super/subscripts.",
) )
@click.option( @click.option(
"-strip", "-strip",

View File

@ -131,13 +131,15 @@ class BaseParser():
return True return True
return False return False
def _initialize_new_table(self, table_idx, cols, rows): def _initialize_new_table(self, table_idx, bbox, cols, rows):
"""Initialize new table object, ready to be populated """Initialize new table object, ready to be populated
Parameters Parameters
---------- ----------
table_idx : int table_idx : int
Index of this table within the pdf page analyzed Index of this table within the pdf page analyzed
bbox : set
bounding box of this table within the pdf page analyzed
cols : list cols : list
list of coordinate boundaries tuples (left, right) list of coordinate boundaries tuples (left, right)
rows : list rows : list
@ -151,7 +153,7 @@ class BaseParser():
table = Table(cols, rows) table = Table(cols, rows)
table.page = self.page table.page = self.page
table.order = table_idx + 1 table.order = table_idx + 1
table._bbox = self.table_bboxes()[table_idx] table._bbox = bbox
return table return table
@staticmethod @staticmethod
@ -191,7 +193,7 @@ class BaseParser():
# Pure virtual, must be defined by the derived parser # Pure virtual, must be defined by the derived parser
raise NotImplementedError() raise NotImplementedError()
def _generate_table(self, table_idx, cols, rows, **kwargs): def _generate_table(self, table_idx, bbox, cols, rows, **kwargs):
# Pure virtual, must be defined by the derived parser # Pure virtual, must be defined by the derived parser
raise NotImplementedError() raise NotImplementedError()
@ -225,7 +227,7 @@ class BaseParser():
user_cols user_cols
) )
table = self._generate_table( table = self._generate_table(
table_idx, cols, rows, v_s=v_s, h_s=h_s) table_idx, bbox, cols, rows, v_s=v_s, h_s=h_s)
_tables.append(table) _tables.append(table)
return _tables return _tables
@ -467,8 +469,8 @@ class TextBaseParser(BaseParser):
raise ValueError("Length of table_areas and columns" raise ValueError("Length of table_areas and columns"
" should be equal") " should be equal")
def _generate_table(self, table_idx, cols, rows, **kwargs): def _generate_table(self, table_idx, bbox, cols, rows, **kwargs):
table = self._initialize_new_table(table_idx, cols, rows) table = self._initialize_new_table(table_idx, bbox, cols, rows)
table = table.set_all_edges() table = table.set_all_edges()
self.record_parse_metadata(table) self.record_parse_metadata(table)

View File

@ -104,10 +104,15 @@ class Hybrid(BaseParser):
parser = self.table_bbox_parses[bbox] parser = self.table_bbox_parses[bbox]
return parser._generate_columns_and_rows(bbox, table_idx) return parser._generate_columns_and_rows(bbox, table_idx)
def _generate_table(self, table_idx, cols, rows, **kwargs): def _generate_table(self, table_idx, bbox, cols, rows, **kwargs):
bbox = self.table_bboxes()[table_idx]
parser = self.table_bbox_parses[bbox] parser = self.table_bbox_parses[bbox]
return parser._generate_table(table_idx, cols, rows, **kwargs) table = parser._generate_table(table_idx, bbox, cols, rows, **kwargs)
# Because hybrid can inject extraneous splits from both lattice and
# network, remove lines / cols that are completely empty.
df = table.df
df[df.astype(bool)].dropna(axis=0, how="all", inplace=True)
df[df.astype(bool)].dropna(axis=1, how="all", inplace=True)
return table
@staticmethod @staticmethod
def _augment_boundaries_with_splits(boundaries, splits, tolerance=0): def _augment_boundaries_with_splits(boundaries, splits, tolerance=0):
@ -223,6 +228,3 @@ class Hybrid(BaseParser):
# Add the bboxes from network that haven't been merged # Add the bboxes from network that haven't been merged
for network_bbox in _network_bboxes: for network_bbox in _network_bboxes:
self.table_bbox_parses[network_bbox] = self.network_parser self.table_bbox_parses[network_bbox] = self.network_parser
def record_parse_metadata(self, table):
super().record_parse_metadata(table)

View File

@ -322,13 +322,13 @@ class Lattice(BaseParser):
] ]
return cols, rows, v_s, h_s return cols, rows, v_s, h_s
def _generate_table(self, table_idx, cols, rows, **kwargs): def _generate_table(self, table_idx, bbox, cols, rows, **kwargs):
v_s = kwargs.get("v_s") v_s = kwargs.get("v_s")
h_s = kwargs.get("h_s") h_s = kwargs.get("h_s")
if v_s is None or h_s is None: if v_s is None or h_s is None:
raise ValueError("No segments found on {}".format(self.rootname)) raise ValueError("No segments found on {}".format(self.rootname))
table = self._initialize_new_table(table_idx, cols, rows) table = self._initialize_new_table(table_idx, bbox, cols, rows)
# set table edges to True using ver+hor lines # set table edges to True using ver+hor lines
table = table.set_edges(v_s, h_s, joint_tol=self.joint_tol) table = table.set_edges(v_s, h_s, joint_tol=self.joint_tol)
# set table border edges to True # set table border edges to True

View File

@ -110,9 +110,21 @@ def download_url(url):
shutil.move(f.name, filepath) shutil.move(f.name, filepath)
return filepath return filepath
common_kwargs = [
stream_kwargs = ["columns", "edge_tol", "row_tol", "column_tol"] "flag_size",
lattice_kwargs = [ "margins",
"split_text",
"strip_text",
"table_areas",
"table_regions"
]
text_kwargs = common_kwargs + [
"columns",
"edge_tol",
"row_tol",
"column_tol"
]
lattice_kwargs = common_kwargs+ [
"process_background", "process_background",
"line_scale", "line_scale",
"copy_text", "copy_text",
@ -124,11 +136,18 @@ lattice_kwargs = [
"iterations", "iterations",
"resolution", "resolution",
] ]
flavor_to_kwargs = {
"stream": text_kwargs,
"network": text_kwargs,
"lattice": lattice_kwargs,
"hybrid": text_kwargs + lattice_kwargs,
}
def validate_input(kwargs, flavor="lattice"): def validate_input(kwargs, flavor="lattice"):
def check_intersection(parser_kwargs, input_kwargs): parser_kwargs = flavor_to_kwargs[flavor]
isec = set(parser_kwargs).intersection(set(input_kwargs.keys())) # s.difference(t): new set with elements in s but not in t
isec = set(kwargs.keys()).difference(set(parser_kwargs))
if isec: if isec:
raise ValueError( raise ValueError(
"{} cannot be used with flavor='{}'".format( "{} cannot be used with flavor='{}'".format(
@ -136,20 +155,13 @@ def validate_input(kwargs, flavor="lattice"):
) )
) )
if flavor == "lattice":
check_intersection(stream_kwargs, kwargs)
else:
check_intersection(lattice_kwargs, kwargs)
def remove_extra(kwargs, flavor="lattice"):
    """Strip from ``kwargs`` every option the given flavor does not accept.

    The accepted option names are looked up in ``flavor_to_kwargs``; a
    flavor missing from that mapping raises ``KeyError``.  ``kwargs`` is
    modified in place and the same dict is returned.
    """
    accepted = flavor_to_kwargs[flavor]
    # Collect the names first: deleting while iterating the dict itself
    # raises "dictionary changed size during iteration".
    unsupported = [name for name in kwargs if name not in accepted]
    for name in unsupported:
        del kwargs[name]
    return kwargs

View File

@ -3701,6 +3701,52 @@ data_lattice_process_background = [
["Total", "", "47", "92", "11.81", "22,455", "19,584", "10,644"], ["Total", "", "47", "92", "11.81", "22,455", "19,584", "10,644"],
] ]
# Expected cell contents, one inner list per table row (8 cells each), used
# by test_hybrid_process_background: it is wrapped in a DataFrame and
# compared against tables[1].df for "background_lines_1.pdf" read with
# flavor="hybrid" and process_background=True.
data_hybrid_process_background = [
[
"State",
"Date",
"Halt",
"Halt",
"Persons",
"Persons",
"Persons",
"Persons",
],
["", "", "stations", "days", "directly", "trained", "counseled", "tested"],
["", "", "", "", "reached", "", "", "for HIV"],
["", "", "", "", "(in lakh)", "", "", ""],
["Delhi", "1.12.2009", "8", "17", "1.29", "3,665", "2,409", "1,000"],
["Rajasthan", "2.12.2009 to", "", "", "", "", "", ""],
["", "19.12.2009", "", "", "", "", "", ""],
["Gujarat", "20.12.2009 to", "6", "13", "6.03", "3,810", "2,317", "1,453"],
["", "3.1.2010", "", "", "", "", "", ""],
[
"Maharashtra",
"4.01.2010 to",
"13",
"26",
"1.27",
"5,680",
"9,027",
"4,153",
],
["", "1.2.2010", "", "", "", "", "", ""],
[
"Karnataka",
"2.2.2010 to",
"11",
"19",
"1.80",
"5,741",
"3,658",
"3,183",
],
["", "22.2.2010", "", "", "", "", "", ""],
["Kerala", "23.2.2010 to", "9", "17", "1.42", "3,559", "2,173", "855"],
["", "11.3.2010", "", "", "", "", "", ""],
["Total", "", "47", "92", "11.81", "22,455", "19,584", "10,644"],
]
data_lattice_copy_text = [ data_lattice_copy_text = [
["Plan Type", "County", "Plan Name", "Totals"], ["Plan Type", "County", "Plan Name", "Totals"],
["GMC", "Sacramento", "Anthem Blue Cross", "164,380"], ["GMC", "Sacramento", "Anthem Blue Cross", "164,380"],

View File

@ -315,6 +315,15 @@ def test_hybrid_vertical_header():
assert_frame_equal(df, tables[0].df) assert_frame_equal(df, tables[0].df)
def test_hybrid_process_background():
    """Hybrid flavor with process_background=True yields the expected second table."""
    pdf_path = os.path.join(testdir, "background_lines_1.pdf")
    parsed = camelot.read_pdf(
        pdf_path, flavor="hybrid", process_background=True)
    expected = pd.DataFrame(data_hybrid_process_background)
    # The comparison targets the second table found in the file (index 1).
    assert_frame_equal(expected, parsed[1].df)
# Lattice parser tests # Lattice parser tests
def test_lattice(): def test_lattice():
df = pd.DataFrame(data_lattice) df = pd.DataFrame(data_lattice)