Prep work for new hybrid parser introduction
Refactor parsers by moving common code to the base class
Maintain Python 3.5 compatibility by replacing f-strings (f"{}") with str.format()
pull/153/head
parent
161f71230d
commit
37483ca202
|
|
@ -413,7 +413,7 @@ class Table(object):
|
|||
}
|
||||
return report
|
||||
|
||||
def record_metadata(self, parser):
|
||||
def record_parse_metadata(self, parser):
|
||||
"""Record data about the origin of the table
|
||||
"""
|
||||
self.flavor = parser.id
|
||||
|
|
|
|||
|
|
@ -3,7 +3,8 @@
|
|||
import os
|
||||
|
||||
from ..utils import (
|
||||
get_text_objects
|
||||
get_text_objects,
|
||||
get_table_index
|
||||
)
|
||||
from ..core import Table
|
||||
|
||||
|
|
@ -11,8 +12,26 @@ from ..core import Table
|
|||
class BaseParser(object):
|
||||
"""Defines a base parser.
|
||||
"""
|
||||
def __init__(self, parser_id):
|
||||
def __init__(self,
|
||||
parser_id,
|
||||
table_regions=None,
|
||||
table_areas=None,
|
||||
split_text=False,
|
||||
strip_text="",
|
||||
shift_text=None,
|
||||
flag_size=False,
|
||||
):
|
||||
self.id = parser_id
|
||||
self.table_regions = table_regions
|
||||
self.table_areas = table_areas
|
||||
|
||||
self.split_text = split_text
|
||||
self.strip_text = strip_text
|
||||
self.shift_text = shift_text
|
||||
|
||||
self.flag_size = flag_size
|
||||
|
||||
self.t_bbox = None
|
||||
|
||||
# For plotting details of parsing algorithms
|
||||
self.debug_info = {}
|
||||
|
|
@ -57,3 +76,38 @@ class BaseParser(object):
|
|||
table.page = self.page
|
||||
table.order = table_idx + 1
|
||||
return table
|
||||
|
||||
|
||||
@staticmethod
def _reduce_index(t, idx, shift_text):
    """Hook for adjusting the index of a text object inside a spanning cell.

    The base implementation does nothing: ``idx`` is handed back untouched
    and both ``t`` and ``shift_text`` are ignored.  Parsers that need to
    move text out of spanning cells (e.g. Lattice) override this method.

    Parameters
    ----------
    t : object
        Table the index refers to (unused here).
    idx : object
        Index (or collection of indices) to reduce.
    shift_text : object
        Shift directions (unused here).

    Returns
    -------
    idx : object
        The ``idx`` argument, unchanged.

    """
    return idx
|
||||
|
||||
def _compute_parse_errors(self, table):
    """Place every text object into ``table`` and collect position errors.

    Each text object in ``self.t_bbox`` (the "vertical" list first, then
    the "horizontal" one) is passed to ``get_table_index``, which returns
    the ``(row, column, text)`` entries the object maps to together with
    an error value.  The entries then go through the parser's
    ``_reduce_index`` hook and their text is written into ``table.cells``.

    Parameters
    ----------
    table : object
        Table being filled (expected to be a ``Table``); the ``text``
        attribute of ``table.cells[row][column]`` is overwritten in place.

    Returns
    -------
    pos_errors : list
        The error values returned by ``get_table_index``, one per text
        object that was placed into the table.

    """
    pos_errors = []
    # TODO: have a single list in place of two directional ones?
    # sorted on x-coordinate based on reading order i.e. LTR or RTL
    for direction in ["vertical", "horizontal"]:
        for t in self.t_bbox[direction]:
            indices, error = get_table_index(
                table,
                t,
                direction,
                split_text=self.split_text,
                flag_size=self.flag_size,
                strip_text=self.strip_text,
            )
            # `indices` is a sequence of (row, column, text) entries (see
            # the unpacking loop below), so the former guard
            # `indices[:2] != (-1, -1)` compared a slice of entries with a
            # pair of ints and could never be false.  Compare the first
            # entry's (row, column) instead.
            # NOTE(review): (-1, -1) is presumably get_table_index's
            # "not located" marker -- confirm.  Without this skip, such an
            # entry would index cells[-1][-1].
            if indices and tuple(indices[0][:2]) == (-1, -1):
                continue
            pos_errors.append(error)
            # Look the hook up on the concrete class so that subclass
            # overrides of the static method are honoured.
            indices = type(self)._reduce_index(
                table,
                indices,
                shift_text=self.shift_text
            )
            for r_idx, c_idx, text in indices:
                table.cells[r_idx][c_idx].text = text
    return pos_errors
|
||||
|
||||
|
|
|
|||
|
|
@ -114,16 +114,18 @@ class Lattice(BaseParser):
|
|||
resolution=300,
|
||||
**kwargs
|
||||
):
|
||||
super().__init__("lattice")
|
||||
self.table_regions = table_regions
|
||||
self.table_areas = table_areas
|
||||
super().__init__(
|
||||
"lattice",
|
||||
table_regions=table_regions,
|
||||
table_areas=table_areas,
|
||||
split_text=split_text,
|
||||
strip_text=strip_text,
|
||||
shift_text=shift_text or ["l", "t"],
|
||||
flag_size=flag_size,
|
||||
)
|
||||
self.process_background = process_background
|
||||
self.line_scale = line_scale
|
||||
self.copy_text = copy_text
|
||||
self.shift_text = shift_text or ["l", "t"]
|
||||
self.split_text = split_text
|
||||
self.flag_size = flag_size
|
||||
self.strip_text = strip_text
|
||||
self.line_tol = line_tol
|
||||
self.joint_tol = joint_tol
|
||||
self.threshold_blocksize = threshold_blocksize
|
||||
|
|
@ -178,6 +180,7 @@ class Lattice(BaseParser):
|
|||
indices.append((r_idx, c_idx, text))
|
||||
return indices
|
||||
|
||||
|
||||
@staticmethod
|
||||
def _copy_spanning_text(t, copy_text=None):
|
||||
"""Copies over text in empty spanning cells.
|
||||
|
|
@ -368,7 +371,7 @@ class Lattice(BaseParser):
|
|||
copy_text=self.copy_text
|
||||
)
|
||||
|
||||
table.record_metadata(self)
|
||||
table.record_parse_metadata(self)
|
||||
table.accuracy = accuracy
|
||||
|
||||
# for plotting
|
||||
|
|
|
|||
|
|
@ -10,7 +10,7 @@ import pandas as pd
|
|||
|
||||
from .base import BaseParser
|
||||
from ..core import TextEdges
|
||||
from ..utils import (text_in_bbox, get_table_index, compute_accuracy,
|
||||
from ..utils import (text_in_bbox, compute_accuracy,
|
||||
compute_whitespace)
|
||||
|
||||
|
||||
|
|
@ -61,22 +61,24 @@ class Stream(BaseParser):
|
|||
table_regions=None,
|
||||
table_areas=None,
|
||||
columns=None,
|
||||
split_text=False,
|
||||
flag_size=False,
|
||||
split_text=False,
|
||||
strip_text="",
|
||||
edge_tol=50,
|
||||
row_tol=2,
|
||||
column_tol=0,
|
||||
**kwargs
|
||||
):
|
||||
super().__init__("stream")
|
||||
self.table_regions = table_regions
|
||||
self.table_areas = table_areas
|
||||
super().__init__(
|
||||
"stream",
|
||||
table_regions=table_regions,
|
||||
table_areas=table_areas,
|
||||
split_text=split_text,
|
||||
strip_text=strip_text,
|
||||
flag_size=flag_size,
|
||||
)
|
||||
self.columns = columns
|
||||
self._validate_columns()
|
||||
self.split_text = split_text
|
||||
self.flag_size = flag_size
|
||||
self.strip_text = strip_text
|
||||
self.edge_tol = edge_tol
|
||||
self.row_tol = row_tol
|
||||
self.column_tol = column_tol
|
||||
|
|
@ -418,26 +420,10 @@ class Stream(BaseParser):
|
|||
table = self._initialize_new_table(table_idx, cols, rows)
|
||||
table = table.set_all_edges()
|
||||
|
||||
pos_errors = []
|
||||
# TODO: have a single list in place of two directional ones?
|
||||
# sorted on x-coordinate based on reading order i.e. LTR or RTL
|
||||
for direction in ["vertical", "horizontal"]:
|
||||
for t in self.t_bbox[direction]:
|
||||
indices, error = get_table_index(
|
||||
table,
|
||||
t,
|
||||
direction,
|
||||
split_text=self.split_text,
|
||||
flag_size=self.flag_size,
|
||||
strip_text=self.strip_text,
|
||||
)
|
||||
if indices[:2] != (-1, -1):
|
||||
pos_errors.append(error)
|
||||
for r_idx, c_idx, text in indices:
|
||||
table.cells[r_idx][c_idx].text = text
|
||||
pos_errors = self._compute_parse_errors(table)
|
||||
accuracy = compute_accuracy([[100, pos_errors]])
|
||||
|
||||
table.record_metadata(self)
|
||||
table.record_parse_metadata(self)
|
||||
|
||||
table.accuracy = accuracy
|
||||
|
||||
|
|
|
|||
|
|
@ -1018,7 +1018,11 @@ def export_pdf_as_png(pdf_path, destination_path):
|
|||
pdf_path : str
|
||||
destination_path : str
|
||||
"""
|
||||
gs_call = f"-q -sDEVICE=png16m -o {destination_path} -r300 {pdf_path}"
|
||||
gs_call = "-q -sDEVICE=png16m -o {destination_path} -r300 {pdf_path}"\
|
||||
.format(
|
||||
destination_path=destination_path,
|
||||
pdf_path=pdf_path
|
||||
)
|
||||
gs_call = gs_call.encode().split()
|
||||
null = open(os.devnull, "wb")
|
||||
Ghostscript(*gs_call, stdout=null)
|
||||
|
|
@ -1038,19 +1042,28 @@ def compare_tables(left, right):
|
|||
differences = []
|
||||
if (diff_rows):
|
||||
differences.append(
|
||||
f"{abs(diff_rows)} "
|
||||
f"{'more' if diff_rows>0 else 'fewer'} rows"
|
||||
"{diff_rows} {more_fewer} rows".format(
|
||||
diff_rows=abs(diff_rows),
|
||||
more_fewer='more' if diff_rows>0 else 'fewer'
|
||||
)
|
||||
)
|
||||
if (diff_cols):
|
||||
differences.append(
|
||||
f"{abs(diff_cols)} "
|
||||
f"{'more' if diff_cols>0 else 'fewer'} columns"
|
||||
"{diff_cols} {more_fewer} columns".format(
|
||||
diff_cols=abs(diff_cols),
|
||||
more_fewer='more' if diff_cols>0 else 'fewer'
|
||||
)
|
||||
)
|
||||
if differences:
|
||||
differences_str = " and ".join(differences)
|
||||
print(f"Right has {differences_str} than left "
|
||||
f"[{right.shape[0]},{right.shape[1]}] vs "
|
||||
f"[{left.shape[0]},{left.shape[1]}]")
|
||||
print(
|
||||
"Right has {differences_str} than left "
|
||||
"{shape_right} vs {shape_left}".format(
|
||||
differences_str=differences_str,
|
||||
shape_right=[right.shape[0], right.shape[1]],
|
||||
shape_left=[left.shape[0], left.shape[1]]
|
||||
)
|
||||
)
|
||||
|
||||
table1, table2 = [left, right]
|
||||
name_table1, name_table2 = ["left", "right"]
|
||||
|
|
@ -1070,8 +1083,11 @@ def compare_tables(left, right):
|
|||
diff_df[name_table2] = lcol
|
||||
diff_df["Match"] = lcol == scol
|
||||
print(
|
||||
f"Column {i} different:\n"
|
||||
f"{diff_df}"
|
||||
"Column {i} different:\n"
|
||||
"{diff_df}".format(
|
||||
i=i,
|
||||
diff_df=diff_df
|
||||
)
|
||||
)
|
||||
break
|
||||
else:
|
||||
|
|
|
|||
Loading…
Reference in New Issue