Add docstring stubs

pull/2/head
Vinayak Mehta 2018-09-05 19:35:46 +05:30
parent bf63432494
commit a4d3165e94
7 changed files with 543 additions and 0 deletions

View File

@ -4,6 +4,9 @@ import numpy as np
class Cell(object):
"""
"""
def __init__(self, x1, y1, x2, y2):
self.x1 = x1
self.y1 = y1
@ -27,23 +30,56 @@ class Cell(object):
pass
def add_text(self, text):
    """Append a piece of text to the text already held by this cell.

    Parameters
    ----------
    text
        String to concatenate onto the end of ``self.text``.
    """
    pieces = (self.text, text)
    self.text = ''.join(pieces)
def get_text(self):
    """Return the text accumulated on this cell.

    Returns
    -------
    The current value of ``self.text``.
    """
    return self.text
def add_object(self, t_object):
    """Record an object on this cell.

    Parameters
    ----------
    t_object
        Object appended to the end of ``self.text_objects``.
    """
    collected = self.text_objects
    collected.append(t_object)
def get_objects(self):
    """Return the objects recorded on this cell.

    Returns
    -------
    The ``self.text_objects`` container itself (not a copy).
    """
    return self.text_objects
def get_bounded_edges(self):
    """Add up the four edge attributes of this cell.

    The sum of ``top``, ``bottom``, ``left`` and ``right`` is cached on
    ``self.bounded_edges`` before being returned.

    Returns
    -------
    ``self.top + self.bottom + self.left + self.right``.
    """
    total = self.top + self.bottom + self.left + self.right
    self.bounded_edges = total
    return total
class Table(object):
"""
"""
def __init__(self, cols, rows):
self.cols = cols
self.rows = rows
@ -60,6 +96,12 @@ class Table(object):
return '<{} shape={}>'.format(self.__class__.__name__, self._shape)
def set_all_edges(self):
"""
Returns
-------
"""
for r in range(len(self.rows)):
for c in range(len(self.cols)):
self.cells[r][c].left = True
@ -69,6 +111,12 @@ class Table(object):
return self
def set_border_edges(self):
"""
Returns
-------
"""
for r in range(len(self.rows)):
self.cells[r][0].left = True
self.cells[r][len(self.cols) - 1].right = True
@ -78,6 +126,18 @@ class Table(object):
return self
def set_edges(self, vertical, horizontal, jtol=2):
"""
Parameters
----------
vertical
horizontal
jtol
Returns
-------
"""
for v in vertical:
# find closest x coord
# iterate over y coords and find closest points
@ -185,6 +245,12 @@ class Table(object):
return self
def set_spanning(self):
"""
Returns
-------
"""
for r in range(len(self.rows)):
for c in range(len(self.cols)):
bound = self.cells[r][c].get_bounded_edges()
@ -225,6 +291,12 @@ class Table(object):
@property
def data(self):
"""
Returns
-------
"""
d = []
for r in range(len(self.rows)):
d.append([self.cells[r][c].get_text().strip()
@ -233,6 +305,12 @@ class Table(object):
@property
def df(self):
    """Read-only view of the value stored in ``self._df``.

    Returns
    -------
    The object currently held in ``self._df``
    (presumably a DataFrame, judging by the name -- TODO confirm).
    """
    return self._df
@df.setter
@ -241,6 +319,12 @@ class Table(object):
@property
def shape(self):
    """Read-only view of the value stored in ``self._shape``.

    Returns
    -------
    The object currently held in ``self._shape``.
    """
    return self._shape
@shape.setter
@ -249,6 +333,12 @@ class Table(object):
@property
def accuracy(self):
    """Read-only view of the value stored in ``self._accuracy``.

    Returns
    -------
    The object currently held in ``self._accuracy``.
    """
    return self._accuracy
@accuracy.setter
@ -257,6 +347,12 @@ class Table(object):
@property
def whitespace(self):
    """Read-only view of the value stored in ``self._whitespace``.

    Returns
    -------
    The object currently held in ``self._whitespace``.
    """
    return self._whitespace
@whitespace.setter
@ -265,6 +361,12 @@ class Table(object):
@property
def order(self):
    """Read-only view of the value stored in ``self._order``.

    Returns
    -------
    The object currently held in ``self._order``.
    """
    return self._order
@order.setter
@ -273,6 +375,12 @@ class Table(object):
@property
def page(self):
    """Read-only view of the value stored in ``self._page``.

    Returns
    -------
    The object currently held in ``self._page``.
    """
    return self._page
@page.setter
@ -281,6 +389,12 @@ class Table(object):
@property
def parsing_report(self):
"""
Returns
-------
"""
# pretty?
report = {
'accuracy': self._accuracy,
@ -292,6 +406,9 @@ class Table(object):
class TableList(list):
"""
"""
def __init__(self, tables):
self._tables = tables
@ -307,6 +424,9 @@ class TableList(list):
class Geometry(object):
"""
"""
def __init__(self):
self._text = []
self._images = ()
@ -315,6 +435,12 @@ class Geometry(object):
@property
def text(self):
    """Read-only view of the value stored in ``self._text``.

    Returns
    -------
    The object currently held in ``self._text`` (an empty list right
    after construction).
    """
    return self._text
@text.setter
@ -323,6 +449,12 @@ class Geometry(object):
@property
def images(self):
    """Read-only view of the value stored in ``self._images``.

    Returns
    -------
    The object currently held in ``self._images`` (an empty tuple right
    after construction).
    """
    return self._images
@images.setter
@ -331,6 +463,12 @@ class Geometry(object):
@property
def segments(self):
    """Read-only view of the value stored in ``self._segments``.

    Returns
    -------
    The object currently held in ``self._segments``.
    """
    return self._segments
@segments.setter
@ -339,6 +477,12 @@ class Geometry(object):
@property
def tables(self):
    """Read-only view of the value stored in ``self._tables``.

    Returns
    -------
    The object currently held in ``self._tables``.
    """
    return self._tables
@tables.setter
@ -347,6 +491,9 @@ class Geometry(object):
class GeometryList(object):
"""
"""
def __init__(self, geometry):
self._text = [g.text for g in geometry]
self._images = [g.images for g in geometry]
@ -363,16 +510,40 @@ class GeometryList(object):
@property
def text(self):
    """Read-only view of the collected ``text`` values.

    Returns
    -------
    ``self._text``, which the constructor builds as a list holding the
    ``text`` attribute of every geometry it was given.
    """
    return self._text
@property
def images(self):
    """Read-only view of the collected ``images`` values.

    Returns
    -------
    ``self._images``, which the constructor builds as a list holding the
    ``images`` attribute of every geometry it was given.
    """
    return self._images
@property
def segments(self):
    """Read-only view of the value stored in ``self._segments``.

    Returns
    -------
    The object currently held in ``self._segments``
    (presumably one entry per geometry, like ``text`` -- TODO confirm).
    """
    return self._segments
@property
def tables(self):
    """Read-only view of the value stored in ``self._tables``.

    Returns
    -------
    The object currently held in ``self._tables``
    (presumably one entry per geometry, like ``text`` -- TODO confirm).
    """
    return self._tables

View File

@ -9,6 +9,9 @@ from .utils import get_page_layout, get_text_objects, get_rotation
class PDFHandler(object):
"""
"""
def __init__(self, filename, pages='1'):
self.filename = filename
if not self.filename.endswith('.pdf'):
@ -71,6 +74,17 @@ class PDFHandler(object):
outfile.write(f)
def parse(self, mesh=False, **kwargs):
"""
Parameters
----------
mesh
kwargs
Returns
-------
"""
for p in self.pages:
self.__save_page(self.filename, p, self.temp)
pages = [os.path.join(self.temp, 'page-{0}.pdf'.format(p))

View File

@ -8,6 +8,19 @@ from .utils import merge_tuples
def adaptive_threshold(imagename, invert=False, blocksize=15, c=-2):
"""
Parameters
----------
imagename
invert
blocksize
c
Returns
-------
"""
img = cv2.imread(imagename)
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
@ -21,6 +34,19 @@ def adaptive_threshold(imagename, invert=False, blocksize=15, c=-2):
def find_lines(threshold, direction='horizontal', scale=15, iterations=0):
"""
Parameters
----------
threshold
direction
scale
iterations
Returns
-------
"""
lines = []
if direction == 'vertical':
@ -57,6 +83,17 @@ def find_lines(threshold, direction='horizontal', scale=15, iterations=0):
def find_table_contours(vertical, horizontal):
"""
Parameters
----------
vertical
horizontal
Returns
-------
"""
mask = vertical + horizontal
try:
@ -76,6 +113,18 @@ def find_table_contours(vertical, horizontal):
def find_table_joints(contours, vertical, horizontal):
"""
Parameters
----------
contours
vertical
horizontal
Returns
-------
"""
joints = np.bitwise_and(vertical, horizontal)
tables = {}
for c in contours:
@ -100,6 +149,17 @@ def find_table_joints(contours, vertical, horizontal):
def remove_lines(threshold, line_scale=15):
"""
Parameters
----------
threshold
line_scale
Returns
-------
"""
size = threshold.shape[0] // line_scale
vertical_erode_el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size))
horizontal_erode_el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1))
@ -117,6 +177,17 @@ def remove_lines(threshold, line_scale=15):
def find_cuts(threshold, char_scale=200):
"""
Parameters
----------
threshold
char_scale
Returns
-------
"""
size = threshold.shape[0] // char_scale
char_el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size))

View File

@ -2,6 +2,19 @@ from .handlers import PDFHandler
def read_pdf(filepath, pages='1', mesh=False, **kwargs):
"""
Parameters
----------
filepath
pages
mesh
kwargs
Returns
-------
"""
# explicit type conversion
p = PDFHandler(filepath, pages)
tables, __ = p.parse(mesh=mesh, **kwargs)

View File

@ -32,6 +32,9 @@ copy_reg.pickle(types.MethodType, _reduce_method)
class Stream:
"""
"""
def __init__(self, table_area=None, columns=None, ytol=[2], mtol=[0],
margins=(1.0, 0.5, 0.1), split_text=False, flag_size=True,
debug=False):
@ -134,6 +137,16 @@ class Stream:
return cols
def extract_tables(self, pdfname):
"""
Parameters
----------
pdfname
Returns
-------
"""
layout, dim = get_page_layout(pdfname, char_margin=self.char_margin,
line_margin=self.line_margin, word_margin=self.word_margin)
lttextlh = get_text_objects(layout, ltype="lh")
@ -265,6 +278,9 @@ class Stream:
class Lattice:
"""
"""
def __init__(self, table_area=None, fill=None, mtol=[2], jtol=[2],
blocksize=15, threshold_constant=-2, scale=15, iterations=0,
invert=False, margins=(1.0, 0.5, 0.1), split_text=False,
@ -328,6 +344,16 @@ class Lattice:
return t
def extract_tables(self, pdfname):
"""
Parameters
----------
pdfname
Returns
-------
"""
layout, dim = get_page_layout(pdfname, char_margin=self.char_margin,
line_margin=self.line_margin, word_margin=self.word_margin)
lttextlh = get_text_objects(layout, ltype="lh")

View File

@ -6,6 +6,16 @@ from .handlers import PDFHandler
def plot_geometry(filepath, pages='1', mesh=False, geometry_type='text', **kwargs):
"""
Parameters
----------
filepath
pages
mesh
geometry_type
kwargs
"""
# explicit type conversion
p = PDFHandler(filepath, pages)
kwargs.update({'debug': geometry_type})

View File

@ -19,16 +19,52 @@ from pdfminer.layout import (LAParams, LTAnno, LTChar, LTTextLineHorizontal,
def translate(x1, x2):
    """Shift ``x2`` by the offset ``x1``.

    Parameters
    ----------
    x1
        Offset to add.
    x2
        Value to be shifted.

    Returns
    -------
    ``x2 + x1`` as a new value.

    Notes
    -----
    The sum is built with ``+`` rather than ``+=`` so that a mutable
    argument (e.g. a list or a numpy array) passed as ``x2`` is not
    modified in the caller's scope.
    """
    return x2 + x1
def scale(x, s):
    """Multiply ``x`` by the factor ``s``.

    Parameters
    ----------
    x
        Value to be scaled.
    s
        Scaling factor.

    Returns
    -------
    ``x * s`` as a new value.

    Notes
    -----
    The product is built with ``*`` rather than ``*=`` so that a mutable
    argument (e.g. a list or a numpy array) passed as ``x`` is not
    modified in the caller's scope.
    """
    return x * s
def rotate(x1, y1, x2, y2, angle):
"""
Parameters
----------
x1
y1
x2
y2
angle
Returns
-------
"""
s = np.sin(angle)
c = np.cos(angle)
x2 = translate(-x1, x2)
@ -41,6 +77,17 @@ def rotate(x1, y1, x2, y2, angle):
def scale_to_image(k, factors):
"""
Parameters
----------
k
factors
Returns
-------
"""
x1, y1, x2, y2 = k
scaling_factor_x, scaling_factor_y, pdf_y = factors
x1 = scale(x1, scaling_factor_x)
@ -52,6 +99,19 @@ def scale_to_image(k, factors):
def scale_to_pdf(tables, v_segments, h_segments, factors):
"""
Parameters
----------
tables
v_segments
h_segments
factors
Returns
-------
"""
scaling_factor_x, scaling_factor_y, img_y = factors
tables_new = {}
for k in tables.keys():
@ -84,6 +144,16 @@ def scale_to_pdf(tables, v_segments, h_segments, factors):
def setup_logging(log_filepath):
"""
Parameters
----------
log_filepath
Returns
-------
"""
logger = logging.getLogger("app_logger")
logger.setLevel(logging.DEBUG)
# Log File Handler (Associating one log file per webservice run)
@ -105,6 +175,18 @@ def setup_logging(log_filepath):
def get_rotation(lttextlh, lttextlv, ltchar):
"""
Parameters
----------
lttextlh
lttextlv
ltchar
Returns
-------
"""
rotation = ''
hlen = len([t for t in lttextlh if t.get_text().strip()])
vlen = len([t for t in lttextlv if t.get_text().strip()])
@ -116,6 +198,18 @@ def get_rotation(lttextlh, lttextlv, ltchar):
def segments_bbox(bbox, v_segments, h_segments):
"""
Parameters
----------
bbox
v_segments
h_segments
Returns
-------
"""
lb = (bbox[0], bbox[1])
rt = (bbox[2], bbox[3])
v_s = [v for v in v_segments if v[1] > lb[1] - 2 and
@ -126,6 +220,17 @@ def segments_bbox(bbox, v_segments, h_segments):
def text_in_bbox(bbox, text):
"""
Parameters
----------
bbox
text
Returns
-------
"""
lb = (bbox[0], bbox[1])
rt = (bbox[2], bbox[3])
t_bbox = [t for t in text if lb[0] - 2 <= (t.x0 + t.x1) / 2.0
@ -135,6 +240,17 @@ def text_in_bbox(bbox, text):
def remove_close_values(ar, mtol=2):
"""
Parameters
----------
ar
mtol
Returns
-------
"""
ret = []
for a in ar:
if not ret:
@ -149,6 +265,17 @@ def remove_close_values(ar, mtol=2):
def merge_close_values(ar, mtol=2):
"""
Parameters
----------
ar
mtol
Returns
-------
"""
ret = []
for a in ar:
if not ret:
@ -164,6 +291,17 @@ def merge_close_values(ar, mtol=2):
def flag_on_size(textline, direction):
"""
Parameters
----------
textline
direction
Returns
-------
"""
if direction == 'horizontal':
d = [(t.get_text(), np.round(t.height, decimals=6)) for t in textline if not isinstance(t, LTAnno)]
elif direction == 'vertical':
@ -190,6 +328,19 @@ def flag_on_size(textline, direction):
def split_textline(table, textline, direction, flag_size=True):
"""
Parameters
----------
table
textline
direction
flag_size
Returns
-------
"""
idx = 0
cut_text = []
bbox = textline.bbox
@ -241,6 +392,20 @@ def split_textline(table, textline, direction, flag_size=True):
def get_table_index(table, t, direction, split_text=False, flag_size=True):
"""
Parameters
----------
table
t
direction
split_text
flag_size
Returns
-------
"""
r_idx, c_idx = [-1] * 2
for r in range(len(table.rows)):
if ((t.y0 + t.y1) / 2.0 < table.rows[r][0] and
@ -284,6 +449,16 @@ def get_table_index(table, t, direction, split_text=False, flag_size=True):
def compute_accuracy(error_weights):
"""
Parameters
----------
error_weights
Returns
-------
"""
SCORE_VAL = 100
try:
score = 0
@ -299,6 +474,16 @@ def compute_accuracy(error_weights):
def remove_empty(d):
"""
Parameters
----------
d
Returns
-------
"""
for i, row in enumerate(d):
if row == [''] * len(row):
d.pop(i)
@ -309,6 +494,16 @@ def remove_empty(d):
def count_empty(d):
"""
Parameters
----------
d
Returns
-------
"""
empty_p = 0
r_nempty_cells, c_nempty_cells = [], []
for i in d:
@ -334,11 +529,33 @@ def count_empty(d):
def encode_(ar):
    """UTF-8 encode every element of a two-level nested iterable.

    Parameters
    ----------
    ar
        Iterable of rows, each row being an iterable of objects that
        provide ``.encode('utf-8')``.

    Returns
    -------
    A new list of lists with the same layout as ``ar``, holding the
    encoded elements. The input is not modified.
    """
    encoded_rows = []
    for row in ar:
        encoded_rows.append([cell.encode('utf-8') for cell in row])
    return encoded_rows
def get_text_objects(layout, ltype="char", t=None):
"""
Parameters
----------
layout
ltype
t
Returns
-------
"""
if ltype == "char":
LTObject = LTChar
elif ltype == "lh":
@ -360,6 +577,21 @@ def get_text_objects(layout, ltype="char", t=None):
def get_page_layout(pname, char_margin=1.0, line_margin=0.5, word_margin=0.1,
detect_vertical=True, all_texts=True):
"""
Parameters
----------
pname
char_margin
line_margin
word_margin
detect_vertical
all_texts
Returns
-------
"""
with open(pname, 'r') as f:
parser = PDFParser(f)
document = PDFDocument(parser)
@ -383,6 +615,12 @@ def get_page_layout(pname, char_margin=1.0, line_margin=0.5, word_margin=0.1,
def merge_tuples(tuples):
"""
Parameters
----------
tuples
"""
merged = list(tuples[0])
for s, e in tuples:
if s <= merged[1]: