Prep work for new hybrid parser introduction
Refactor parsers by moving common code to the base class
Maintain Python 3.5 compatibility by replacing f-strings (f"{}") with str.format() calls
pull/153/head
parent
161f71230d
commit
37483ca202
|
|
@ -413,7 +413,7 @@ class Table(object):
|
||||||
}
|
}
|
||||||
return report
|
return report
|
||||||
|
|
||||||
def record_metadata(self, parser):
|
def record_parse_metadata(self, parser):
|
||||||
"""Record data about the origin of the table
|
"""Record data about the origin of the table
|
||||||
"""
|
"""
|
||||||
self.flavor = parser.id
|
self.flavor = parser.id
|
||||||
|
|
|
||||||
|
|
@ -3,7 +3,8 @@
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from ..utils import (
|
from ..utils import (
|
||||||
get_text_objects
|
get_text_objects,
|
||||||
|
get_table_index
|
||||||
)
|
)
|
||||||
from ..core import Table
|
from ..core import Table
|
||||||
|
|
||||||
|
|
@ -11,8 +12,26 @@ from ..core import Table
|
||||||
class BaseParser(object):
|
class BaseParser(object):
|
||||||
"""Defines a base parser.
|
"""Defines a base parser.
|
||||||
"""
|
"""
|
||||||
def __init__(self, parser_id):
|
def __init__(self,
|
||||||
|
parser_id,
|
||||||
|
table_regions=None,
|
||||||
|
table_areas=None,
|
||||||
|
split_text=False,
|
||||||
|
strip_text="",
|
||||||
|
shift_text=None,
|
||||||
|
flag_size=False,
|
||||||
|
):
|
||||||
self.id = parser_id
|
self.id = parser_id
|
||||||
|
self.table_regions = table_regions
|
||||||
|
self.table_areas = table_areas
|
||||||
|
|
||||||
|
self.split_text = split_text
|
||||||
|
self.strip_text = strip_text
|
||||||
|
self.shift_text = shift_text
|
||||||
|
|
||||||
|
self.flag_size = flag_size
|
||||||
|
|
||||||
|
self.t_bbox = None
|
||||||
|
|
||||||
# For plotting details of parsing algorithms
|
# For plotting details of parsing algorithms
|
||||||
self.debug_info = {}
|
self.debug_info = {}
|
||||||
|
|
@ -57,3 +76,38 @@ class BaseParser(object):
|
||||||
table.page = self.page
|
table.page = self.page
|
||||||
table.order = table_idx + 1
|
table.order = table_idx + 1
|
||||||
return table
|
return table
|
||||||
|
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _reduce_index(t, idx, shift_text):
|
||||||
|
"""Reduces index of a text object if it lies within a spanning
|
||||||
|
cell. Only useful for some parsers (e.g. Lattice), base method is a
|
||||||
|
noop.
|
||||||
|
"""
|
||||||
|
return idx
|
||||||
|
|
||||||
|
def _compute_parse_errors(self, table):
|
||||||
|
pos_errors = []
|
||||||
|
# TODO: have a single list in place of two directional ones?
|
||||||
|
# sorted on x-coordinate based on reading order i.e. LTR or RTL
|
||||||
|
for direction in ["vertical", "horizontal"]:
|
||||||
|
for t in self.t_bbox[direction]:
|
||||||
|
indices, error = get_table_index(
|
||||||
|
table,
|
||||||
|
t,
|
||||||
|
direction,
|
||||||
|
split_text=self.split_text,
|
||||||
|
flag_size=self.flag_size,
|
||||||
|
strip_text=self.strip_text,
|
||||||
|
)
|
||||||
|
if indices[:2] != (-1, -1):
|
||||||
|
pos_errors.append(error)
|
||||||
|
indices = type(self)._reduce_index(
|
||||||
|
table,
|
||||||
|
indices,
|
||||||
|
shift_text=self.shift_text
|
||||||
|
)
|
||||||
|
for r_idx, c_idx, text in indices:
|
||||||
|
table.cells[r_idx][c_idx].text = text
|
||||||
|
return pos_errors
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -114,16 +114,18 @@ class Lattice(BaseParser):
|
||||||
resolution=300,
|
resolution=300,
|
||||||
**kwargs
|
**kwargs
|
||||||
):
|
):
|
||||||
super().__init__("lattice")
|
super().__init__(
|
||||||
self.table_regions = table_regions
|
"lattice",
|
||||||
self.table_areas = table_areas
|
table_regions=table_regions,
|
||||||
|
table_areas=table_areas,
|
||||||
|
split_text=split_text,
|
||||||
|
strip_text=strip_text,
|
||||||
|
shift_text=shift_text or ["l", "t"],
|
||||||
|
flag_size=flag_size,
|
||||||
|
)
|
||||||
self.process_background = process_background
|
self.process_background = process_background
|
||||||
self.line_scale = line_scale
|
self.line_scale = line_scale
|
||||||
self.copy_text = copy_text
|
self.copy_text = copy_text
|
||||||
self.shift_text = shift_text or ["l", "t"]
|
|
||||||
self.split_text = split_text
|
|
||||||
self.flag_size = flag_size
|
|
||||||
self.strip_text = strip_text
|
|
||||||
self.line_tol = line_tol
|
self.line_tol = line_tol
|
||||||
self.joint_tol = joint_tol
|
self.joint_tol = joint_tol
|
||||||
self.threshold_blocksize = threshold_blocksize
|
self.threshold_blocksize = threshold_blocksize
|
||||||
|
|
@ -178,6 +180,7 @@ class Lattice(BaseParser):
|
||||||
indices.append((r_idx, c_idx, text))
|
indices.append((r_idx, c_idx, text))
|
||||||
return indices
|
return indices
|
||||||
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _copy_spanning_text(t, copy_text=None):
|
def _copy_spanning_text(t, copy_text=None):
|
||||||
"""Copies over text in empty spanning cells.
|
"""Copies over text in empty spanning cells.
|
||||||
|
|
@ -368,7 +371,7 @@ class Lattice(BaseParser):
|
||||||
copy_text=self.copy_text
|
copy_text=self.copy_text
|
||||||
)
|
)
|
||||||
|
|
||||||
table.record_metadata(self)
|
table.record_parse_metadata(self)
|
||||||
table.accuracy = accuracy
|
table.accuracy = accuracy
|
||||||
|
|
||||||
# for plotting
|
# for plotting
|
||||||
|
|
|
||||||
|
|
@ -10,7 +10,7 @@ import pandas as pd
|
||||||
|
|
||||||
from .base import BaseParser
|
from .base import BaseParser
|
||||||
from ..core import TextEdges
|
from ..core import TextEdges
|
||||||
from ..utils import (text_in_bbox, get_table_index, compute_accuracy,
|
from ..utils import (text_in_bbox, compute_accuracy,
|
||||||
compute_whitespace)
|
compute_whitespace)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -61,22 +61,24 @@ class Stream(BaseParser):
|
||||||
table_regions=None,
|
table_regions=None,
|
||||||
table_areas=None,
|
table_areas=None,
|
||||||
columns=None,
|
columns=None,
|
||||||
split_text=False,
|
|
||||||
flag_size=False,
|
flag_size=False,
|
||||||
|
split_text=False,
|
||||||
strip_text="",
|
strip_text="",
|
||||||
edge_tol=50,
|
edge_tol=50,
|
||||||
row_tol=2,
|
row_tol=2,
|
||||||
column_tol=0,
|
column_tol=0,
|
||||||
**kwargs
|
**kwargs
|
||||||
):
|
):
|
||||||
super().__init__("stream")
|
super().__init__(
|
||||||
self.table_regions = table_regions
|
"stream",
|
||||||
self.table_areas = table_areas
|
table_regions=table_regions,
|
||||||
|
table_areas=table_areas,
|
||||||
|
split_text=split_text,
|
||||||
|
strip_text=strip_text,
|
||||||
|
flag_size=flag_size,
|
||||||
|
)
|
||||||
self.columns = columns
|
self.columns = columns
|
||||||
self._validate_columns()
|
self._validate_columns()
|
||||||
self.split_text = split_text
|
|
||||||
self.flag_size = flag_size
|
|
||||||
self.strip_text = strip_text
|
|
||||||
self.edge_tol = edge_tol
|
self.edge_tol = edge_tol
|
||||||
self.row_tol = row_tol
|
self.row_tol = row_tol
|
||||||
self.column_tol = column_tol
|
self.column_tol = column_tol
|
||||||
|
|
@ -418,26 +420,10 @@ class Stream(BaseParser):
|
||||||
table = self._initialize_new_table(table_idx, cols, rows)
|
table = self._initialize_new_table(table_idx, cols, rows)
|
||||||
table = table.set_all_edges()
|
table = table.set_all_edges()
|
||||||
|
|
||||||
pos_errors = []
|
pos_errors = self._compute_parse_errors(table)
|
||||||
# TODO: have a single list in place of two directional ones?
|
|
||||||
# sorted on x-coordinate based on reading order i.e. LTR or RTL
|
|
||||||
for direction in ["vertical", "horizontal"]:
|
|
||||||
for t in self.t_bbox[direction]:
|
|
||||||
indices, error = get_table_index(
|
|
||||||
table,
|
|
||||||
t,
|
|
||||||
direction,
|
|
||||||
split_text=self.split_text,
|
|
||||||
flag_size=self.flag_size,
|
|
||||||
strip_text=self.strip_text,
|
|
||||||
)
|
|
||||||
if indices[:2] != (-1, -1):
|
|
||||||
pos_errors.append(error)
|
|
||||||
for r_idx, c_idx, text in indices:
|
|
||||||
table.cells[r_idx][c_idx].text = text
|
|
||||||
accuracy = compute_accuracy([[100, pos_errors]])
|
accuracy = compute_accuracy([[100, pos_errors]])
|
||||||
|
|
||||||
table.record_metadata(self)
|
table.record_parse_metadata(self)
|
||||||
|
|
||||||
table.accuracy = accuracy
|
table.accuracy = accuracy
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1018,7 +1018,11 @@ def export_pdf_as_png(pdf_path, destination_path):
|
||||||
pdf_path : str
|
pdf_path : str
|
||||||
destination_path : str
|
destination_path : str
|
||||||
"""
|
"""
|
||||||
gs_call = f"-q -sDEVICE=png16m -o {destination_path} -r300 {pdf_path}"
|
gs_call = "-q -sDEVICE=png16m -o {destination_path} -r300 {pdf_path}"\
|
||||||
|
.format(
|
||||||
|
destination_path=destination_path,
|
||||||
|
pdf_path=pdf_path
|
||||||
|
)
|
||||||
gs_call = gs_call.encode().split()
|
gs_call = gs_call.encode().split()
|
||||||
null = open(os.devnull, "wb")
|
null = open(os.devnull, "wb")
|
||||||
Ghostscript(*gs_call, stdout=null)
|
Ghostscript(*gs_call, stdout=null)
|
||||||
|
|
@ -1038,19 +1042,28 @@ def compare_tables(left, right):
|
||||||
differences = []
|
differences = []
|
||||||
if (diff_rows):
|
if (diff_rows):
|
||||||
differences.append(
|
differences.append(
|
||||||
f"{abs(diff_rows)} "
|
"{diff_rows} {more_fewer} rows".format(
|
||||||
f"{'more' if diff_rows>0 else 'fewer'} rows"
|
diff_rows=abs(diff_rows),
|
||||||
|
more_fewer='more' if diff_rows>0 else 'fewer'
|
||||||
|
)
|
||||||
)
|
)
|
||||||
if (diff_cols):
|
if (diff_cols):
|
||||||
differences.append(
|
differences.append(
|
||||||
f"{abs(diff_cols)} "
|
"{diff_cols} {more_fewer} columns".format(
|
||||||
f"{'more' if diff_cols>0 else 'fewer'} columns"
|
diff_cols=abs(diff_cols),
|
||||||
|
more_fewer='more' if diff_cols>0 else 'fewer'
|
||||||
|
)
|
||||||
)
|
)
|
||||||
if differences:
|
if differences:
|
||||||
differences_str = " and ".join(differences)
|
differences_str = " and ".join(differences)
|
||||||
print(f"Right has {differences_str} than left "
|
print(
|
||||||
f"[{right.shape[0]},{right.shape[1]}] vs "
|
"Right has {differences_str} than left "
|
||||||
f"[{left.shape[0]},{left.shape[1]}]")
|
"{shape_right} vs {shape_left}".format(
|
||||||
|
differences_str=differences_str,
|
||||||
|
shape_right=[right.shape[0], right.shape[1]],
|
||||||
|
shape_left=[left.shape[0], left.shape[1]]
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
table1, table2 = [left, right]
|
table1, table2 = [left, right]
|
||||||
name_table1, name_table2 = ["left", "right"]
|
name_table1, name_table2 = ["left", "right"]
|
||||||
|
|
@ -1070,8 +1083,11 @@ def compare_tables(left, right):
|
||||||
diff_df[name_table2] = lcol
|
diff_df[name_table2] = lcol
|
||||||
diff_df["Match"] = lcol == scol
|
diff_df["Match"] = lcol == scol
|
||||||
print(
|
print(
|
||||||
f"Column {i} different:\n"
|
"Column {i} different:\n"
|
||||||
f"{diff_df}"
|
"{diff_df}".format(
|
||||||
|
i=i,
|
||||||
|
diff_df=diff_df
|
||||||
|
)
|
||||||
)
|
)
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue