parent
63adfd5468
commit
9abdd00cec
|
|
@ -58,7 +58,7 @@ pass_config = click.make_pass_decorator(Config)
|
|||
"-flag",
|
||||
"--flag_size",
|
||||
is_flag=True,
|
||||
help="Flag text based on" " font size. Useful to detect super/subscripts.",
|
||||
help="Flag text based on font size. Useful to detect super/subscripts.",
|
||||
)
|
||||
@click.option(
|
||||
"-strip",
|
||||
|
|
|
|||
|
|
@ -131,13 +131,15 @@ class BaseParser():
|
|||
return True
|
||||
return False
|
||||
|
||||
def _initialize_new_table(self, table_idx, cols, rows):
|
||||
def _initialize_new_table(self, table_idx, bbox, cols, rows):
|
||||
"""Initialize new table object, ready to be populated
|
||||
|
||||
Parameters
|
||||
----------
|
||||
table_idx : int
|
||||
Index of this table within the pdf page analyzed
|
||||
bbox : set
|
||||
bounding box of this table within the pdf page analyzed
|
||||
cols : list
|
||||
list of coordinate boundaries tuples (left, right)
|
||||
rows : list
|
||||
|
|
@ -151,7 +153,7 @@ class BaseParser():
|
|||
table = Table(cols, rows)
|
||||
table.page = self.page
|
||||
table.order = table_idx + 1
|
||||
table._bbox = self.table_bboxes()[table_idx]
|
||||
table._bbox = bbox
|
||||
return table
|
||||
|
||||
@staticmethod
|
||||
|
|
@ -191,7 +193,7 @@ class BaseParser():
|
|||
# Pure virtual, must be defined by the derived parser
|
||||
raise NotImplementedError()
|
||||
|
||||
def _generate_table(self, table_idx, cols, rows, **kwargs):
|
||||
def _generate_table(self, table_idx, bbox, cols, rows, **kwargs):
|
||||
# Pure virtual, must be defined by the derived parser
|
||||
raise NotImplementedError()
|
||||
|
||||
|
|
@ -225,7 +227,7 @@ class BaseParser():
|
|||
user_cols
|
||||
)
|
||||
table = self._generate_table(
|
||||
table_idx, cols, rows, v_s=v_s, h_s=h_s)
|
||||
table_idx, bbox, cols, rows, v_s=v_s, h_s=h_s)
|
||||
_tables.append(table)
|
||||
|
||||
return _tables
|
||||
|
|
@ -467,8 +469,8 @@ class TextBaseParser(BaseParser):
|
|||
raise ValueError("Length of table_areas and columns"
|
||||
" should be equal")
|
||||
|
||||
def _generate_table(self, table_idx, cols, rows, **kwargs):
|
||||
table = self._initialize_new_table(table_idx, cols, rows)
|
||||
def _generate_table(self, table_idx, bbox, cols, rows, **kwargs):
|
||||
table = self._initialize_new_table(table_idx, bbox, cols, rows)
|
||||
table = table.set_all_edges()
|
||||
self.record_parse_metadata(table)
|
||||
|
||||
|
|
|
|||
|
|
@ -104,10 +104,15 @@ class Hybrid(BaseParser):
|
|||
parser = self.table_bbox_parses[bbox]
|
||||
return parser._generate_columns_and_rows(bbox, table_idx)
|
||||
|
||||
def _generate_table(self, table_idx, cols, rows, **kwargs):
|
||||
bbox = self.table_bboxes()[table_idx]
|
||||
def _generate_table(self, table_idx, bbox, cols, rows, **kwargs):
|
||||
parser = self.table_bbox_parses[bbox]
|
||||
return parser._generate_table(table_idx, cols, rows, **kwargs)
|
||||
table = parser._generate_table(table_idx, bbox, cols, rows, **kwargs)
|
||||
# Because hybrid can inject extraneous splits from both lattice and
|
||||
# network, remove lines / cols that are completely empty.
|
||||
df = table.df
|
||||
df[df.astype(bool)].dropna(axis=0, how="all", inplace=True)
|
||||
df[df.astype(bool)].dropna(axis=1, how="all", inplace=True)
|
||||
return table
|
||||
|
||||
@staticmethod
|
||||
def _augment_boundaries_with_splits(boundaries, splits, tolerance=0):
|
||||
|
|
@ -223,6 +228,3 @@ class Hybrid(BaseParser):
|
|||
# Add the bboxes from network that haven't been merged
|
||||
for network_bbox in _network_bboxes:
|
||||
self.table_bbox_parses[network_bbox] = self.network_parser
|
||||
|
||||
def record_parse_metadata(self, table):
|
||||
super().record_parse_metadata(table)
|
||||
|
|
|
|||
|
|
@ -322,13 +322,13 @@ class Lattice(BaseParser):
|
|||
]
|
||||
return cols, rows, v_s, h_s
|
||||
|
||||
def _generate_table(self, table_idx, cols, rows, **kwargs):
|
||||
def _generate_table(self, table_idx, bbox, cols, rows, **kwargs):
|
||||
v_s = kwargs.get("v_s")
|
||||
h_s = kwargs.get("h_s")
|
||||
if v_s is None or h_s is None:
|
||||
raise ValueError("No segments found on {}".format(self.rootname))
|
||||
|
||||
table = self._initialize_new_table(table_idx, cols, rows)
|
||||
table = self._initialize_new_table(table_idx, bbox, cols, rows)
|
||||
# set table edges to True using ver+hor lines
|
||||
table = table.set_edges(v_s, h_s, joint_tol=self.joint_tol)
|
||||
# set table border edges to True
|
||||
|
|
|
|||
|
|
@ -110,9 +110,21 @@ def download_url(url):
|
|||
shutil.move(f.name, filepath)
|
||||
return filepath
|
||||
|
||||
|
||||
stream_kwargs = ["columns", "edge_tol", "row_tol", "column_tol"]
|
||||
lattice_kwargs = [
|
||||
common_kwargs = [
|
||||
"flag_size",
|
||||
"margins",
|
||||
"split_text",
|
||||
"strip_text",
|
||||
"table_areas",
|
||||
"table_regions"
|
||||
]
|
||||
text_kwargs = common_kwargs + [
|
||||
"columns",
|
||||
"edge_tol",
|
||||
"row_tol",
|
||||
"column_tol"
|
||||
]
|
||||
lattice_kwargs = common_kwargs+ [
|
||||
"process_background",
|
||||
"line_scale",
|
||||
"copy_text",
|
||||
|
|
@ -124,33 +136,33 @@ lattice_kwargs = [
|
|||
"iterations",
|
||||
"resolution",
|
||||
]
|
||||
flavor_to_kwargs = {
|
||||
"stream": text_kwargs,
|
||||
"network": text_kwargs,
|
||||
"lattice": lattice_kwargs,
|
||||
"hybrid": text_kwargs + lattice_kwargs,
|
||||
}
|
||||
|
||||
|
||||
def validate_input(kwargs, flavor="lattice"):
|
||||
def check_intersection(parser_kwargs, input_kwargs):
|
||||
isec = set(parser_kwargs).intersection(set(input_kwargs.keys()))
|
||||
if isec:
|
||||
raise ValueError(
|
||||
"{} cannot be used with flavor='{}'".format(
|
||||
",".join(sorted(isec)), flavor
|
||||
)
|
||||
parser_kwargs = flavor_to_kwargs[flavor]
|
||||
# s.difference(t): new set with elements in s but not in t
|
||||
isec = set(kwargs.keys()).difference(set(parser_kwargs))
|
||||
if isec:
|
||||
raise ValueError(
|
||||
"{} cannot be used with flavor='{}'".format(
|
||||
",".join(sorted(isec)), flavor
|
||||
)
|
||||
|
||||
if flavor == "lattice":
|
||||
check_intersection(stream_kwargs, kwargs)
|
||||
else:
|
||||
check_intersection(lattice_kwargs, kwargs)
|
||||
)
|
||||
|
||||
|
||||
def remove_extra(kwargs, flavor="lattice"):
|
||||
if flavor == "lattice":
|
||||
for key in kwargs.keys():
|
||||
if key in stream_kwargs:
|
||||
kwargs.pop(key)
|
||||
else:
|
||||
for key in kwargs.keys():
|
||||
if key in lattice_kwargs:
|
||||
kwargs.pop(key)
|
||||
parser_kwargs = flavor_to_kwargs[flavor]
|
||||
# Avoid "dictionary changed size during iteration"
|
||||
kwargs_keys = list(kwargs.keys())
|
||||
for key in kwargs_keys:
|
||||
if key not in parser_kwargs:
|
||||
kwargs.pop(key)
|
||||
return kwargs
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -3701,6 +3701,52 @@ data_lattice_process_background = [
|
|||
["Total", "", "47", "92", "11.81", "22,455", "19,584", "10,644"],
|
||||
]
|
||||
|
||||
data_hybrid_process_background = [
|
||||
[
|
||||
"State",
|
||||
"Date",
|
||||
"Halt",
|
||||
"Halt",
|
||||
"Persons",
|
||||
"Persons",
|
||||
"Persons",
|
||||
"Persons",
|
||||
],
|
||||
["", "", "stations", "days", "directly", "trained", "counseled", "tested"],
|
||||
["", "", "", "", "reached", "", "", "for HIV"],
|
||||
["", "", "", "", "(in lakh)", "", "", ""],
|
||||
["Delhi", "1.12.2009", "8", "17", "1.29", "3,665", "2,409", "1,000"],
|
||||
["Rajasthan", "2.12.2009 to", "", "", "", "", "", ""],
|
||||
["", "19.12.2009", "", "", "", "", "", ""],
|
||||
["Gujarat", "20.12.2009 to", "6", "13", "6.03", "3,810", "2,317", "1,453"],
|
||||
["", "3.1.2010", "", "", "", "", "", ""],
|
||||
[
|
||||
"Maharashtra",
|
||||
"4.01.2010 to",
|
||||
"13",
|
||||
"26",
|
||||
"1.27",
|
||||
"5,680",
|
||||
"9,027",
|
||||
"4,153",
|
||||
],
|
||||
["", "1.2.2010", "", "", "", "", "", ""],
|
||||
[
|
||||
"Karnataka",
|
||||
"2.2.2010 to",
|
||||
"11",
|
||||
"19",
|
||||
"1.80",
|
||||
"5,741",
|
||||
"3,658",
|
||||
"3,183",
|
||||
],
|
||||
["", "22.2.2010", "", "", "", "", "", ""],
|
||||
["Kerala", "23.2.2010 to", "9", "17", "1.42", "3,559", "2,173", "855"],
|
||||
["", "11.3.2010", "", "", "", "", "", ""],
|
||||
["Total", "", "47", "92", "11.81", "22,455", "19,584", "10,644"],
|
||||
]
|
||||
|
||||
data_lattice_copy_text = [
|
||||
["Plan Type", "County", "Plan Name", "Totals"],
|
||||
["GMC", "Sacramento", "Anthem Blue Cross", "164,380"],
|
||||
|
|
|
|||
|
|
@ -321,6 +321,15 @@ def test_hybrid_vertical_header():
|
|||
assert_frame_equal(df, tables[0].df)
|
||||
|
||||
|
||||
def test_hybrid_process_background():
|
||||
df = pd.DataFrame(data_hybrid_process_background)
|
||||
|
||||
filename = os.path.join(testdir, "background_lines_1.pdf")
|
||||
tables = camelot.read_pdf(
|
||||
filename, flavor="hybrid", process_background=True)
|
||||
assert_frame_equal(df, tables[1].df)
|
||||
|
||||
|
||||
# Lattice parser tests
|
||||
def test_lattice():
|
||||
df = pd.DataFrame(data_lattice)
|
||||
|
|
|
|||
Loading…
Reference in New Issue