Add BaseParser docstring stub

pull/2/head
Vinayak Mehta 2018-09-06 05:55:05 +05:30
parent 96af09d9cd
commit d0005101a7
2 changed files with 19 additions and 22 deletions

View File

@@ -91,8 +91,8 @@ class PDFHandler(object):
for p in self.pages]
tables = []
geometry = []
parser = Stream(**kwargs) if not mesh else Lattice(**kwargs)
for p in pages:
parser = Stream(**kwargs) if not mesh else Lattice(**kwargs)
t, g = parser.extract_tables(p)
tables.extend(t)
geometry.append(g)

View File

@@ -32,15 +32,9 @@ copy_reg.pickle(types.MethodType, _reduce_method)
class BaseParser(object):
# init objects
# no tables condition
# convert pdf to image - lattice
# image processing - lattice
# user given table area condition
# scale image components to pdf components - lattice
# compute cols and rows for each table
# create table for each table
"""
"""
def _generate_layout(self, filename):
self.filename = filename
self.layout, self.dimensions = get_page_layout(
@@ -229,7 +223,7 @@ class Stream(BaseParser):
return cols, rows
def _generate_table(self, table_idx, cols, rows):
def _generate_table(self, table_idx, cols, rows, **kwargs):
table = Table(cols, rows)
table = table.set_all_edges()
pos_errors = []
@@ -276,12 +270,6 @@ class Stream(BaseParser):
os.path.basename(self.basename)))
return [], self.g
if self.debug:
text = []
text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
self.g.text = text
self._generate_table_bbox()
_tables = []
@@ -292,6 +280,12 @@ class Stream(BaseParser):
table = self._generate_table(table_idx, cols, rows)
_tables.append(table)
if self.debug:
text = []
text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
self.g.text = text
return _tables, self.g
@@ -435,7 +429,12 @@ class Lattice(BaseParser):
return cols, rows, v_s, h_s
def _generate_table(self, table_idx, cols, rows, v_s, h_s):
def _generate_table(self, table_idx, cols, rows, **kwargs):
v_s = kwargs.get('v_s')
h_s = kwargs.get('h_s')
if v_s is None or h_s is None:
raise ValueError('No segments found on {}'.format(self.basename))
table = Table(cols, rows)
# set table edges to True using ver+hor lines
table = table.set_edges(v_s, h_s, jtol=self.jtol)
@@ -495,19 +494,17 @@ class Lattice(BaseParser):
self._generate_image()
self._generate_table_bbox()
if self.debug:
self.g.images = (self.image, self.table_bbox_unscaled)
self.g.segments = (self.vertical_segments, self.horizontal_segments)
_tables = []
# sort tables based on y-coord
for table_idx, tk in enumerate(sorted(self.table_bbox.keys(),
key=lambda x: x[1], reverse=True)):
cols, rows, v_s, h_s = self._generate_columns_and_rows(table_idx, tk)
table = self._generate_table(table_idx, cols, rows, v_s, h_s)
table = self._generate_table(table_idx, cols, rows, v_s=v_s, h_s=h_s)
_tables.append(table)
if self.debug:
self.g.images = (self.image, self.table_bbox_unscaled)
self.g.segments = (self.vertical_segments, self.horizontal_segments)
self.g.tables = _tables
return _tables, self.g