parent
63adfd5468
commit
9abdd00cec
|
|
@ -58,7 +58,7 @@ pass_config = click.make_pass_decorator(Config)
|
||||||
"-flag",
|
"-flag",
|
||||||
"--flag_size",
|
"--flag_size",
|
||||||
is_flag=True,
|
is_flag=True,
|
||||||
help="Flag text based on" " font size. Useful to detect super/subscripts.",
|
help="Flag text based on font size. Useful to detect super/subscripts.",
|
||||||
)
|
)
|
||||||
@click.option(
|
@click.option(
|
||||||
"-strip",
|
"-strip",
|
||||||
|
|
|
||||||
|
|
@ -131,13 +131,15 @@ class BaseParser():
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def _initialize_new_table(self, table_idx, cols, rows):
|
def _initialize_new_table(self, table_idx, bbox, cols, rows):
|
||||||
"""Initialize new table object, ready to be populated
|
"""Initialize new table object, ready to be populated
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
table_idx : int
|
table_idx : int
|
||||||
Index of this table within the pdf page analyzed
|
Index of this table within the pdf page analyzed
|
||||||
|
bbox : set
|
||||||
|
bounding box of this table within the pdf page analyzed
|
||||||
cols : list
|
cols : list
|
||||||
list of coordinate boundaries tuples (left, right)
|
list of coordinate boundaries tuples (left, right)
|
||||||
rows : list
|
rows : list
|
||||||
|
|
@ -151,7 +153,7 @@ class BaseParser():
|
||||||
table = Table(cols, rows)
|
table = Table(cols, rows)
|
||||||
table.page = self.page
|
table.page = self.page
|
||||||
table.order = table_idx + 1
|
table.order = table_idx + 1
|
||||||
table._bbox = self.table_bboxes()[table_idx]
|
table._bbox = bbox
|
||||||
return table
|
return table
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
|
@ -191,7 +193,7 @@ class BaseParser():
|
||||||
# Pure virtual, must be defined by the derived parser
|
# Pure virtual, must be defined by the derived parser
|
||||||
raise NotImplementedError()
|
raise NotImplementedError()
|
||||||
|
|
||||||
def _generate_table(self, table_idx, cols, rows, **kwargs):
|
def _generate_table(self, table_idx, bbox, cols, rows, **kwargs):
|
||||||
# Pure virtual, must be defined by the derived parser
|
# Pure virtual, must be defined by the derived parser
|
||||||
raise NotImplementedError()
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
|
@ -225,7 +227,7 @@ class BaseParser():
|
||||||
user_cols
|
user_cols
|
||||||
)
|
)
|
||||||
table = self._generate_table(
|
table = self._generate_table(
|
||||||
table_idx, cols, rows, v_s=v_s, h_s=h_s)
|
table_idx, bbox, cols, rows, v_s=v_s, h_s=h_s)
|
||||||
_tables.append(table)
|
_tables.append(table)
|
||||||
|
|
||||||
return _tables
|
return _tables
|
||||||
|
|
@ -467,8 +469,8 @@ class TextBaseParser(BaseParser):
|
||||||
raise ValueError("Length of table_areas and columns"
|
raise ValueError("Length of table_areas and columns"
|
||||||
" should be equal")
|
" should be equal")
|
||||||
|
|
||||||
def _generate_table(self, table_idx, cols, rows, **kwargs):
|
def _generate_table(self, table_idx, bbox, cols, rows, **kwargs):
|
||||||
table = self._initialize_new_table(table_idx, cols, rows)
|
table = self._initialize_new_table(table_idx, bbox, cols, rows)
|
||||||
table = table.set_all_edges()
|
table = table.set_all_edges()
|
||||||
self.record_parse_metadata(table)
|
self.record_parse_metadata(table)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -104,10 +104,15 @@ class Hybrid(BaseParser):
|
||||||
parser = self.table_bbox_parses[bbox]
|
parser = self.table_bbox_parses[bbox]
|
||||||
return parser._generate_columns_and_rows(bbox, table_idx)
|
return parser._generate_columns_and_rows(bbox, table_idx)
|
||||||
|
|
||||||
def _generate_table(self, table_idx, cols, rows, **kwargs):
|
def _generate_table(self, table_idx, bbox, cols, rows, **kwargs):
|
||||||
bbox = self.table_bboxes()[table_idx]
|
|
||||||
parser = self.table_bbox_parses[bbox]
|
parser = self.table_bbox_parses[bbox]
|
||||||
return parser._generate_table(table_idx, cols, rows, **kwargs)
|
table = parser._generate_table(table_idx, bbox, cols, rows, **kwargs)
|
||||||
|
# Because hybrid can inject extraneous splits from both lattice and
|
||||||
|
# network, remove lines / cols that are completely empty.
|
||||||
|
df = table.df
|
||||||
|
df[df.astype(bool)].dropna(axis=0, how="all", inplace=True)
|
||||||
|
df[df.astype(bool)].dropna(axis=1, how="all", inplace=True)
|
||||||
|
return table
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _augment_boundaries_with_splits(boundaries, splits, tolerance=0):
|
def _augment_boundaries_with_splits(boundaries, splits, tolerance=0):
|
||||||
|
|
@ -223,6 +228,3 @@ class Hybrid(BaseParser):
|
||||||
# Add the bboxes from network that haven't been merged
|
# Add the bboxes from network that haven't been merged
|
||||||
for network_bbox in _network_bboxes:
|
for network_bbox in _network_bboxes:
|
||||||
self.table_bbox_parses[network_bbox] = self.network_parser
|
self.table_bbox_parses[network_bbox] = self.network_parser
|
||||||
|
|
||||||
def record_parse_metadata(self, table):
|
|
||||||
super().record_parse_metadata(table)
|
|
||||||
|
|
|
||||||
|
|
@ -322,13 +322,13 @@ class Lattice(BaseParser):
|
||||||
]
|
]
|
||||||
return cols, rows, v_s, h_s
|
return cols, rows, v_s, h_s
|
||||||
|
|
||||||
def _generate_table(self, table_idx, cols, rows, **kwargs):
|
def _generate_table(self, table_idx, bbox, cols, rows, **kwargs):
|
||||||
v_s = kwargs.get("v_s")
|
v_s = kwargs.get("v_s")
|
||||||
h_s = kwargs.get("h_s")
|
h_s = kwargs.get("h_s")
|
||||||
if v_s is None or h_s is None:
|
if v_s is None or h_s is None:
|
||||||
raise ValueError("No segments found on {}".format(self.rootname))
|
raise ValueError("No segments found on {}".format(self.rootname))
|
||||||
|
|
||||||
table = self._initialize_new_table(table_idx, cols, rows)
|
table = self._initialize_new_table(table_idx, bbox, cols, rows)
|
||||||
# set table edges to True using ver+hor lines
|
# set table edges to True using ver+hor lines
|
||||||
table = table.set_edges(v_s, h_s, joint_tol=self.joint_tol)
|
table = table.set_edges(v_s, h_s, joint_tol=self.joint_tol)
|
||||||
# set table border edges to True
|
# set table border edges to True
|
||||||
|
|
|
||||||
|
|
@ -110,9 +110,21 @@ def download_url(url):
|
||||||
shutil.move(f.name, filepath)
|
shutil.move(f.name, filepath)
|
||||||
return filepath
|
return filepath
|
||||||
|
|
||||||
|
common_kwargs = [
|
||||||
stream_kwargs = ["columns", "edge_tol", "row_tol", "column_tol"]
|
"flag_size",
|
||||||
lattice_kwargs = [
|
"margins",
|
||||||
|
"split_text",
|
||||||
|
"strip_text",
|
||||||
|
"table_areas",
|
||||||
|
"table_regions"
|
||||||
|
]
|
||||||
|
text_kwargs = common_kwargs + [
|
||||||
|
"columns",
|
||||||
|
"edge_tol",
|
||||||
|
"row_tol",
|
||||||
|
"column_tol"
|
||||||
|
]
|
||||||
|
lattice_kwargs = common_kwargs+ [
|
||||||
"process_background",
|
"process_background",
|
||||||
"line_scale",
|
"line_scale",
|
||||||
"copy_text",
|
"copy_text",
|
||||||
|
|
@ -124,33 +136,33 @@ lattice_kwargs = [
|
||||||
"iterations",
|
"iterations",
|
||||||
"resolution",
|
"resolution",
|
||||||
]
|
]
|
||||||
|
flavor_to_kwargs = {
|
||||||
|
"stream": text_kwargs,
|
||||||
|
"network": text_kwargs,
|
||||||
|
"lattice": lattice_kwargs,
|
||||||
|
"hybrid": text_kwargs + lattice_kwargs,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def validate_input(kwargs, flavor="lattice"):
|
def validate_input(kwargs, flavor="lattice"):
|
||||||
def check_intersection(parser_kwargs, input_kwargs):
|
parser_kwargs = flavor_to_kwargs[flavor]
|
||||||
isec = set(parser_kwargs).intersection(set(input_kwargs.keys()))
|
# s.difference(t): new set with elements in s but not in t
|
||||||
if isec:
|
isec = set(kwargs.keys()).difference(set(parser_kwargs))
|
||||||
raise ValueError(
|
if isec:
|
||||||
"{} cannot be used with flavor='{}'".format(
|
raise ValueError(
|
||||||
",".join(sorted(isec)), flavor
|
"{} cannot be used with flavor='{}'".format(
|
||||||
)
|
",".join(sorted(isec)), flavor
|
||||||
)
|
)
|
||||||
|
)
|
||||||
if flavor == "lattice":
|
|
||||||
check_intersection(stream_kwargs, kwargs)
|
|
||||||
else:
|
|
||||||
check_intersection(lattice_kwargs, kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
def remove_extra(kwargs, flavor="lattice"):
|
def remove_extra(kwargs, flavor="lattice"):
|
||||||
if flavor == "lattice":
|
parser_kwargs = flavor_to_kwargs[flavor]
|
||||||
for key in kwargs.keys():
|
# Avoid "dictionary changed size during iteration"
|
||||||
if key in stream_kwargs:
|
kwargs_keys = list(kwargs.keys())
|
||||||
kwargs.pop(key)
|
for key in kwargs_keys:
|
||||||
else:
|
if key not in parser_kwargs:
|
||||||
for key in kwargs.keys():
|
kwargs.pop(key)
|
||||||
if key in lattice_kwargs:
|
|
||||||
kwargs.pop(key)
|
|
||||||
return kwargs
|
return kwargs
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -3701,6 +3701,52 @@ data_lattice_process_background = [
|
||||||
["Total", "", "47", "92", "11.81", "22,455", "19,584", "10,644"],
|
["Total", "", "47", "92", "11.81", "22,455", "19,584", "10,644"],
|
||||||
]
|
]
|
||||||
|
|
||||||
|
data_hybrid_process_background = [
|
||||||
|
[
|
||||||
|
"State",
|
||||||
|
"Date",
|
||||||
|
"Halt",
|
||||||
|
"Halt",
|
||||||
|
"Persons",
|
||||||
|
"Persons",
|
||||||
|
"Persons",
|
||||||
|
"Persons",
|
||||||
|
],
|
||||||
|
["", "", "stations", "days", "directly", "trained", "counseled", "tested"],
|
||||||
|
["", "", "", "", "reached", "", "", "for HIV"],
|
||||||
|
["", "", "", "", "(in lakh)", "", "", ""],
|
||||||
|
["Delhi", "1.12.2009", "8", "17", "1.29", "3,665", "2,409", "1,000"],
|
||||||
|
["Rajasthan", "2.12.2009 to", "", "", "", "", "", ""],
|
||||||
|
["", "19.12.2009", "", "", "", "", "", ""],
|
||||||
|
["Gujarat", "20.12.2009 to", "6", "13", "6.03", "3,810", "2,317", "1,453"],
|
||||||
|
["", "3.1.2010", "", "", "", "", "", ""],
|
||||||
|
[
|
||||||
|
"Maharashtra",
|
||||||
|
"4.01.2010 to",
|
||||||
|
"13",
|
||||||
|
"26",
|
||||||
|
"1.27",
|
||||||
|
"5,680",
|
||||||
|
"9,027",
|
||||||
|
"4,153",
|
||||||
|
],
|
||||||
|
["", "1.2.2010", "", "", "", "", "", ""],
|
||||||
|
[
|
||||||
|
"Karnataka",
|
||||||
|
"2.2.2010 to",
|
||||||
|
"11",
|
||||||
|
"19",
|
||||||
|
"1.80",
|
||||||
|
"5,741",
|
||||||
|
"3,658",
|
||||||
|
"3,183",
|
||||||
|
],
|
||||||
|
["", "22.2.2010", "", "", "", "", "", ""],
|
||||||
|
["Kerala", "23.2.2010 to", "9", "17", "1.42", "3,559", "2,173", "855"],
|
||||||
|
["", "11.3.2010", "", "", "", "", "", ""],
|
||||||
|
["Total", "", "47", "92", "11.81", "22,455", "19,584", "10,644"],
|
||||||
|
]
|
||||||
|
|
||||||
data_lattice_copy_text = [
|
data_lattice_copy_text = [
|
||||||
["Plan Type", "County", "Plan Name", "Totals"],
|
["Plan Type", "County", "Plan Name", "Totals"],
|
||||||
["GMC", "Sacramento", "Anthem Blue Cross", "164,380"],
|
["GMC", "Sacramento", "Anthem Blue Cross", "164,380"],
|
||||||
|
|
|
||||||
|
|
@ -321,6 +321,15 @@ def test_hybrid_vertical_header():
|
||||||
assert_frame_equal(df, tables[0].df)
|
assert_frame_equal(df, tables[0].df)
|
||||||
|
|
||||||
|
|
||||||
|
def test_hybrid_process_background():
|
||||||
|
df = pd.DataFrame(data_hybrid_process_background)
|
||||||
|
|
||||||
|
filename = os.path.join(testdir, "background_lines_1.pdf")
|
||||||
|
tables = camelot.read_pdf(
|
||||||
|
filename, flavor="hybrid", process_background=True)
|
||||||
|
assert_frame_equal(df, tables[1].df)
|
||||||
|
|
||||||
|
|
||||||
# Lattice parser tests
|
# Lattice parser tests
|
||||||
def test_lattice():
|
def test_lattice():
|
||||||
df = pd.DataFrame(data_lattice)
|
df = pd.DataFrame(data_lattice)
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue