Prep work for new hybrid parser introduction

Refactor parsers by moving common code to the base class
Maintain Python 3.5 compatibility by replacing f-strings (f"{}") with str.format() calls
pull/153/head
Frh 2020-04-19 11:32:22 -07:00
parent 697289e409
commit 583868756a
5 changed files with 106 additions and 47 deletions

View File

@ -413,7 +413,7 @@ class Table(object):
}
return report
def record_metadata(self, parser):
def record_parse_metadata(self, parser):
"""Record data about the origin of the table
"""
self.flavor = parser.id

View File

@ -3,7 +3,8 @@
import os
from ..utils import (
get_text_objects
get_text_objects,
get_table_index
)
from ..core import Table
@ -11,8 +12,26 @@ from ..core import Table
class BaseParser(object):
"""Defines a base parser.
"""
def __init__(self, parser_id):
def __init__(self,
parser_id,
table_regions=None,
table_areas=None,
split_text=False,
strip_text="",
shift_text=None,
flag_size=False,
):
self.id = parser_id
self.table_regions = table_regions
self.table_areas = table_areas
self.split_text = split_text
self.strip_text = strip_text
self.shift_text = shift_text
self.flag_size = flag_size
self.t_bbox = None
# For plotting details of parsing algorithms
self.debug_info = {}
@ -57,3 +76,38 @@ class BaseParser(object):
table.page = self.page
table.order = table_idx + 1
return table
@staticmethod
def _reduce_index(t, idx, shift_text):
"""Reduces index of a text object if it lies within a spanning
cell. Only useful for some parsers (e.g. Lattice), base method is a
noop.
"""
return idx
def _compute_parse_errors(self, table):
    """Place text objects into ``table`` and gather their errors.

    Goes through ``self.t_bbox`` for the "vertical" and then the
    "horizontal" direction. For every text object it asks
    ``get_table_index`` where the object belongs, lets the concrete
    parser class post-process that answer through ``_reduce_index``, and
    writes the resulting text into ``table.cells``.

    Parameters
    ----------
    table : object
        Table being filled; its ``cells`` are modified in place.

    Returns
    -------
    list
        The error value reported by ``get_table_index`` for each text
        object that passed the guard below.

    """
    # Resolve the hook on the concrete class once so that subclasses
    # overriding ``_reduce_index`` are honoured.
    parser_cls = type(self)
    errors = []
    # TODO: have a single list in place of two directional ones?
    # sorted on x-coordinate based on reading order i.e. LTR or RTL
    for direction in ("vertical", "horizontal"):
        for textline in self.t_bbox[direction]:
            located, error = get_table_index(
                table,
                textline,
                direction,
                split_text=self.split_text,
                flag_size=self.flag_size,
                strip_text=self.strip_text,
            )
            # NOTE(review): ``located`` is iterated below as 3-item
            # entries, so if it is a list this slice can never equal the
            # tuple (-1, -1) and the guard always passes. Possibly
            # ``located[0][:2]`` was intended -- confirm against
            # get_table_index before changing.
            if located[:2] != (-1, -1):
                errors.append(error)
                located = parser_cls._reduce_index(
                    table,
                    located,
                    shift_text=self.shift_text,
                )
                for row, col, text in located:
                    table.cells[row][col].text = text
    return errors

View File

@ -114,16 +114,18 @@ class Lattice(BaseParser):
resolution=300,
**kwargs
):
super().__init__("lattice")
self.table_regions = table_regions
self.table_areas = table_areas
super().__init__(
"lattice",
table_regions=table_regions,
table_areas=table_areas,
split_text=split_text,
strip_text=strip_text,
shift_text=shift_text or ["l", "t"],
flag_size=flag_size,
)
self.process_background = process_background
self.line_scale = line_scale
self.copy_text = copy_text
self.shift_text = shift_text or ["l", "t"]
self.split_text = split_text
self.flag_size = flag_size
self.strip_text = strip_text
self.line_tol = line_tol
self.joint_tol = joint_tol
self.threshold_blocksize = threshold_blocksize
@ -178,6 +180,7 @@ class Lattice(BaseParser):
indices.append((r_idx, c_idx, text))
return indices
@staticmethod
def _copy_spanning_text(t, copy_text=None):
"""Copies over text in empty spanning cells.
@ -368,7 +371,7 @@ class Lattice(BaseParser):
copy_text=self.copy_text
)
table.record_metadata(self)
table.record_parse_metadata(self)
table.accuracy = accuracy
# for plotting

View File

@ -10,7 +10,7 @@ import pandas as pd
from .base import BaseParser
from ..core import TextEdges
from ..utils import (text_in_bbox, get_table_index, compute_accuracy,
from ..utils import (text_in_bbox, compute_accuracy,
compute_whitespace)
@ -61,22 +61,24 @@ class Stream(BaseParser):
table_regions=None,
table_areas=None,
columns=None,
split_text=False,
flag_size=False,
split_text=False,
strip_text="",
edge_tol=50,
row_tol=2,
column_tol=0,
**kwargs
):
super().__init__("stream")
self.table_regions = table_regions
self.table_areas = table_areas
super().__init__(
"stream",
table_regions=table_regions,
table_areas=table_areas,
split_text=split_text,
strip_text=strip_text,
flag_size=flag_size,
)
self.columns = columns
self._validate_columns()
self.split_text = split_text
self.flag_size = flag_size
self.strip_text = strip_text
self.edge_tol = edge_tol
self.row_tol = row_tol
self.column_tol = column_tol
@ -418,26 +420,10 @@ class Stream(BaseParser):
table = self._initialize_new_table(table_idx, cols, rows)
table = table.set_all_edges()
pos_errors = []
# TODO: have a single list in place of two directional ones?
# sorted on x-coordinate based on reading order i.e. LTR or RTL
for direction in ["vertical", "horizontal"]:
for t in self.t_bbox[direction]:
indices, error = get_table_index(
table,
t,
direction,
split_text=self.split_text,
flag_size=self.flag_size,
strip_text=self.strip_text,
)
if indices[:2] != (-1, -1):
pos_errors.append(error)
for r_idx, c_idx, text in indices:
table.cells[r_idx][c_idx].text = text
pos_errors = self._compute_parse_errors(table)
accuracy = compute_accuracy([[100, pos_errors]])
table.record_metadata(self)
table.record_parse_metadata(self)
table.accuracy = accuracy

View File

@ -1018,7 +1018,11 @@ def export_pdf_as_png(pdf_path, destination_path):
pdf_path : str
destination_path : str
"""
gs_call = f"-q -sDEVICE=png16m -o {destination_path} -r300 {pdf_path}"
gs_call = "-q -sDEVICE=png16m -o {destination_path} -r300 {pdf_path}"\
.format(
destination_path=destination_path,
pdf_path=pdf_path
)
gs_call = gs_call.encode().split()
null = open(os.devnull, "wb")
Ghostscript(*gs_call, stdout=null)
@ -1038,19 +1042,28 @@ def compare_tables(left, right):
differences = []
if (diff_rows):
differences.append(
f"{abs(diff_rows)} "
f"{'more' if diff_rows>0 else 'fewer'} rows"
"{diff_rows} {more_fewer} rows".format(
diff_rows=abs(diff_rows),
more_fewer='more' if diff_rows>0 else 'fewer'
)
)
if (diff_cols):
differences.append(
f"{abs(diff_cols)} "
f"{'more' if diff_cols>0 else 'fewer'} columns"
"{diff_cols} {more_fewer} columns".format(
diff_cols=abs(diff_cols),
more_fewer='more' if diff_cols>0 else 'fewer'
)
)
if differences:
differences_str = " and ".join(differences)
print(f"Right has {differences_str} than left "
f"[{right.shape[0]},{right.shape[1]}] vs "
f"[{left.shape[0]},{left.shape[1]}]")
print(
"Right has {differences_str} than left "
"{shape_right} vs {shape_left}".format(
differences_str=differences_str,
shape_right=[right.shape[0], right.shape[1]],
shape_left=[left.shape[0], left.shape[1]]
)
)
table1, table2 = [left, right]
name_table1, name_table2 = ["left", "right"]
@ -1070,8 +1083,11 @@ def compare_tables(left, right):
diff_df[name_table2] = lcol
diff_df["Match"] = lcol == scol
print(
f"Column {i} different:\n"
f"{diff_df}"
"Column {i} different:\n"
"{diff_df}".format(
i=i,
diff_df=diff_df
)
)
break
else: