Remove docstrings
parent
08cbababca
commit
bf63432494
|
|
@ -8,35 +8,6 @@ from .utils import merge_tuples
|
||||||
|
|
||||||
|
|
||||||
def adaptive_threshold(imagename, invert=False, blocksize=15, c=-2):
|
def adaptive_threshold(imagename, invert=False, blocksize=15, c=-2):
|
||||||
"""Thresholds an image using OpenCV's adaptiveThreshold.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
imagename : string
|
|
||||||
Path to image file.
|
|
||||||
|
|
||||||
invert : bool
|
|
||||||
Whether or not to invert the image. Useful when pdfs have
|
|
||||||
tables with lines in background.
|
|
||||||
(optional, default: False)
|
|
||||||
|
|
||||||
blocksize: int
|
|
||||||
Size of a pixel neighborhood that is used to calculate a
|
|
||||||
threshold value for the pixel: 3, 5, 7, and so on.
|
|
||||||
|
|
||||||
c: float
|
|
||||||
Constant subtracted from the mean or weighted mean
|
|
||||||
(see the details below). Normally, it is positive but may be
|
|
||||||
zero or negative as well.
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
img : object
|
|
||||||
numpy.ndarray representing the original image.
|
|
||||||
|
|
||||||
threshold : object
|
|
||||||
numpy.ndarray representing the thresholded image.
|
|
||||||
"""
|
|
||||||
img = cv2.imread(imagename)
|
img = cv2.imread(imagename)
|
||||||
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
|
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
|
||||||
|
|
||||||
|
|
@ -50,38 +21,6 @@ def adaptive_threshold(imagename, invert=False, blocksize=15, c=-2):
|
||||||
|
|
||||||
|
|
||||||
def find_lines(threshold, direction='horizontal', scale=15, iterations=0):
|
def find_lines(threshold, direction='horizontal', scale=15, iterations=0):
|
||||||
"""Finds horizontal and vertical lines by applying morphological
|
|
||||||
transformations on an image.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
threshold : object
|
|
||||||
numpy.ndarray representing the thresholded image.
|
|
||||||
|
|
||||||
direction : string
|
|
||||||
Specifies whether to find vertical or horizontal lines.
|
|
||||||
(optional, default: 'horizontal')
|
|
||||||
|
|
||||||
scale : int
|
|
||||||
Used to divide the height/width to get a structuring element
|
|
||||||
for morph transform.
|
|
||||||
(optional, default: 15)
|
|
||||||
|
|
||||||
iterations : int
|
|
||||||
Number of iterations for dilation.
|
|
||||||
(optional, default: 0)
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
dmask : object
|
|
||||||
numpy.ndarray representing pixels where vertical/horizontal
|
|
||||||
lines lie.
|
|
||||||
|
|
||||||
lines : list
|
|
||||||
List of tuples representing vertical/horizontal lines with
|
|
||||||
coordinates relative to a left-top origin in
|
|
||||||
OpenCV's coordinate space.
|
|
||||||
"""
|
|
||||||
lines = []
|
lines = []
|
||||||
|
|
||||||
if direction == 'vertical':
|
if direction == 'vertical':
|
||||||
|
|
@ -118,23 +57,6 @@ def find_lines(threshold, direction='horizontal', scale=15, iterations=0):
|
||||||
|
|
||||||
|
|
||||||
def find_table_contours(vertical, horizontal):
|
def find_table_contours(vertical, horizontal):
|
||||||
"""Finds table boundaries using OpenCV's findContours.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
vertical : object
|
|
||||||
numpy.ndarray representing pixels where vertical lines lie.
|
|
||||||
|
|
||||||
horizontal : object
|
|
||||||
numpy.ndarray representing pixels where horizontal lines lie.
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
cont : list
|
|
||||||
List of tuples representing table boundaries. Each tuple is of
|
|
||||||
the form (x, y, w, h) where (x, y) -> left-top, w -> width and
|
|
||||||
h -> height in OpenCV's coordinate space.
|
|
||||||
"""
|
|
||||||
mask = vertical + horizontal
|
mask = vertical + horizontal
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|
@ -154,30 +76,6 @@ def find_table_contours(vertical, horizontal):
|
||||||
|
|
||||||
|
|
||||||
def find_table_joints(contours, vertical, horizontal):
|
def find_table_joints(contours, vertical, horizontal):
|
||||||
"""Finds joints/intersections present inside each table boundary.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
contours : list
|
|
||||||
List of tuples representing table boundaries. Each tuple is of
|
|
||||||
the form (x, y, w, h) where (x, y) -> left-top, w -> width and
|
|
||||||
h -> height in OpenCV's coordinate space.
|
|
||||||
|
|
||||||
vertical : object
|
|
||||||
numpy.ndarray representing pixels where vertical lines lie.
|
|
||||||
|
|
||||||
horizontal : object
|
|
||||||
numpy.ndarray representing pixels where horizontal lines lie.
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
tables : dict
|
|
||||||
Dict with table boundaries as keys and list of intersections
|
|
||||||
in that boundary as their value.
|
|
||||||
|
|
||||||
Keys are of the form (x1, y1, x2, y2) where (x1, y1) -> lb
|
|
||||||
and (x2, y2) -> rt in OpenCV's coordinate space.
|
|
||||||
"""
|
|
||||||
joints = np.bitwise_and(vertical, horizontal)
|
joints = np.bitwise_and(vertical, horizontal)
|
||||||
tables = {}
|
tables = {}
|
||||||
for c in contours:
|
for c in contours:
|
||||||
|
|
@ -202,23 +100,6 @@ def find_table_joints(contours, vertical, horizontal):
|
||||||
|
|
||||||
|
|
||||||
def remove_lines(threshold, line_scale=15):
|
def remove_lines(threshold, line_scale=15):
|
||||||
"""Removes lines from a thresholded image.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
threshold : object
|
|
||||||
numpy.ndarray representing the thresholded image.
|
|
||||||
|
|
||||||
line_scale : int
|
|
||||||
Line scaling factor.
|
|
||||||
(optional, default: 15)
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
threshold : object
|
|
||||||
numpy.ndarray representing the thresholded image
|
|
||||||
with horizontal and vertical lines removed.
|
|
||||||
"""
|
|
||||||
size = threshold.shape[0] // line_scale
|
size = threshold.shape[0] // line_scale
|
||||||
vertical_erode_el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size))
|
vertical_erode_el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size))
|
||||||
horizontal_erode_el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1))
|
horizontal_erode_el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1))
|
||||||
|
|
@ -236,22 +117,6 @@ def remove_lines(threshold, line_scale=15):
|
||||||
|
|
||||||
|
|
||||||
def find_cuts(threshold, char_scale=200):
|
def find_cuts(threshold, char_scale=200):
|
||||||
"""Finds cuts made by text projections on y-axis.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
threshold : object
|
|
||||||
numpy.ndarray representing the thresholded image.
|
|
||||||
|
|
||||||
char_scale : int
|
|
||||||
Char scaling factor.
|
|
||||||
(optional, default: 200)
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
y_cuts : list
|
|
||||||
List of cuts on y-axis.
|
|
||||||
"""
|
|
||||||
size = threshold.shape[0] // char_scale
|
size = threshold.shape[0] // char_scale
|
||||||
char_el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size))
|
char_el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size))
|
||||||
|
|
||||||
|
|
|
||||||
382
camelot/utils.py
382
camelot/utils.py
|
|
@ -19,61 +19,16 @@ from pdfminer.layout import (LAParams, LTAnno, LTChar, LTTextLineHorizontal,
|
||||||
|
|
||||||
|
|
||||||
def translate(x1, x2):
|
def translate(x1, x2):
|
||||||
"""Translates x2 by x1.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
x1 : float
|
|
||||||
|
|
||||||
x2 : float
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
x2 : float
|
|
||||||
"""
|
|
||||||
x2 += x1
|
x2 += x1
|
||||||
return x2
|
return x2
|
||||||
|
|
||||||
|
|
||||||
def scale(x, s):
|
def scale(x, s):
|
||||||
"""Scales x by scaling factor s.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
x : float
|
|
||||||
|
|
||||||
s : float
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
x : float
|
|
||||||
"""
|
|
||||||
x *= s
|
x *= s
|
||||||
return x
|
return x
|
||||||
|
|
||||||
|
|
||||||
def rotate(x1, y1, x2, y2, angle):
|
def rotate(x1, y1, x2, y2, angle):
|
||||||
"""Rotates point x2, y2 about point x1, y1 by angle.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
x1 : float
|
|
||||||
|
|
||||||
y1 : float
|
|
||||||
|
|
||||||
x2 : float
|
|
||||||
|
|
||||||
y2 : float
|
|
||||||
|
|
||||||
angle : float
|
|
||||||
Angle in radians.
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
xnew : float
|
|
||||||
|
|
||||||
ynew : float
|
|
||||||
"""
|
|
||||||
s = np.sin(angle)
|
s = np.sin(angle)
|
||||||
c = np.cos(angle)
|
c = np.cos(angle)
|
||||||
x2 = translate(-x1, x2)
|
x2 = translate(-x1, x2)
|
||||||
|
|
@ -86,28 +41,6 @@ def rotate(x1, y1, x2, y2, angle):
|
||||||
|
|
||||||
|
|
||||||
def scale_to_image(k, factors):
|
def scale_to_image(k, factors):
|
||||||
"""Translates and scales PDFMiner coordinates to OpenCV's coordinate
|
|
||||||
space.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
k : tuple
|
|
||||||
Tuple (x1, y1, x2, y2) representing table bounding box where
|
|
||||||
(x1, y1) -> lt and (x2, y2) -> rb in PDFMiner's coordinate
|
|
||||||
space.
|
|
||||||
|
|
||||||
factors : tuple
|
|
||||||
Tuple (scaling_factor_x, scaling_factor_y, pdf_y) where the
|
|
||||||
first two elements are scaling factors and pdf_y is height of
|
|
||||||
pdf.
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
knew : tuple
|
|
||||||
Tuple (x1, y1, x2, y2) representing table bounding box where
|
|
||||||
(x1, y1) -> lt and (x2, y2) -> rb in OpenCV's coordinate
|
|
||||||
space.
|
|
||||||
"""
|
|
||||||
x1, y1, x2, y2 = k
|
x1, y1, x2, y2 = k
|
||||||
scaling_factor_x, scaling_factor_y, pdf_y = factors
|
scaling_factor_x, scaling_factor_y, pdf_y = factors
|
||||||
x1 = scale(x1, scaling_factor_x)
|
x1 = scale(x1, scaling_factor_x)
|
||||||
|
|
@ -119,34 +52,6 @@ def scale_to_image(k, factors):
|
||||||
|
|
||||||
|
|
||||||
def scale_to_pdf(tables, v_segments, h_segments, factors):
|
def scale_to_pdf(tables, v_segments, h_segments, factors):
|
||||||
"""Translates and scales OpenCV coordinates to PDFMiner's coordinate
|
|
||||||
space.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
tables : dict
|
|
||||||
Dict with table boundaries as keys and list of intersections
|
|
||||||
in that boundary as their value.
|
|
||||||
|
|
||||||
v_segments : list
|
|
||||||
List of vertical line segments.
|
|
||||||
|
|
||||||
h_segments : list
|
|
||||||
List of horizontal line segments.
|
|
||||||
|
|
||||||
factors : tuple
|
|
||||||
Tuple (scaling_factor_x, scaling_factor_y, img_y) where the
|
|
||||||
first two elements are scaling factors and img_y is height of
|
|
||||||
image.
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
tables_new : dict
|
|
||||||
|
|
||||||
v_segments_new : dict
|
|
||||||
|
|
||||||
h_segments_new : dict
|
|
||||||
"""
|
|
||||||
scaling_factor_x, scaling_factor_y, img_y = factors
|
scaling_factor_x, scaling_factor_y, img_y = factors
|
||||||
tables_new = {}
|
tables_new = {}
|
||||||
for k in tables.keys():
|
for k in tables.keys():
|
||||||
|
|
@ -179,12 +84,6 @@ def scale_to_pdf(tables, v_segments, h_segments, factors):
|
||||||
|
|
||||||
|
|
||||||
def setup_logging(log_filepath):
|
def setup_logging(log_filepath):
|
||||||
"""Setup logging
|
|
||||||
Args:
|
|
||||||
log_filepath (string): Path to log file
|
|
||||||
Returns:
|
|
||||||
logging.Logger: Logger object
|
|
||||||
"""
|
|
||||||
logger = logging.getLogger("app_logger")
|
logger = logging.getLogger("app_logger")
|
||||||
logger.setLevel(logging.DEBUG)
|
logger.setLevel(logging.DEBUG)
|
||||||
# Log File Handler (Associating one log file per webservice run)
|
# Log File Handler (Associating one log file per webservice run)
|
||||||
|
|
@ -206,27 +105,6 @@ def setup_logging(log_filepath):
|
||||||
|
|
||||||
|
|
||||||
def get_rotation(lttextlh, lttextlv, ltchar):
|
def get_rotation(lttextlh, lttextlv, ltchar):
|
||||||
"""Detects if text in table is vertical or not using the current
|
|
||||||
transformation matrix (CTM) and returns its orientation.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
lttextlh : list
|
|
||||||
List of PDFMiner LTTextLineHorizontal objects.
|
|
||||||
|
|
||||||
lttextlv : list
|
|
||||||
List of PDFMiner LTTextLineVertical objects.
|
|
||||||
|
|
||||||
ltchar : list
|
|
||||||
List of PDFMiner LTChar objects.
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
rotation : string
|
|
||||||
{'', 'left', 'right'}
|
|
||||||
'' if text in table is upright, 'left' if rotated 90 degrees
|
|
||||||
anti-clockwise and 'right' if rotated 90 degrees clockwise.
|
|
||||||
"""
|
|
||||||
rotation = ''
|
rotation = ''
|
||||||
hlen = len([t for t in lttextlh if t.get_text().strip()])
|
hlen = len([t for t in lttextlh if t.get_text().strip()])
|
||||||
vlen = len([t for t in lttextlv if t.get_text().strip()])
|
vlen = len([t for t in lttextlv if t.get_text().strip()])
|
||||||
|
|
@ -238,29 +116,6 @@ def get_rotation(lttextlh, lttextlv, ltchar):
|
||||||
|
|
||||||
|
|
||||||
def segments_bbox(bbox, v_segments, h_segments):
|
def segments_bbox(bbox, v_segments, h_segments):
|
||||||
"""Returns all line segments present inside a
|
|
||||||
table's bounding box.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
bbox : tuple
|
|
||||||
Tuple (x1, y1, x2, y2) representing table bounding box where
|
|
||||||
(x1, y1) -> lb and (x2, y2) -> rt in PDFMiner's coordinate space.
|
|
||||||
|
|
||||||
v_segments : list
|
|
||||||
List of vertical line segments.
|
|
||||||
|
|
||||||
h_segments : list
|
|
||||||
List of horizontal line segments.
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
v_s : list
|
|
||||||
List of vertical line segments that lie inside table.
|
|
||||||
|
|
||||||
h_s : list
|
|
||||||
List of horizontal line segments that lie inside table.
|
|
||||||
"""
|
|
||||||
lb = (bbox[0], bbox[1])
|
lb = (bbox[0], bbox[1])
|
||||||
rt = (bbox[2], bbox[3])
|
rt = (bbox[2], bbox[3])
|
||||||
v_s = [v for v in v_segments if v[1] > lb[1] - 2 and
|
v_s = [v for v in v_segments if v[1] > lb[1] - 2 and
|
||||||
|
|
@ -271,23 +126,6 @@ def segments_bbox(bbox, v_segments, h_segments):
|
||||||
|
|
||||||
|
|
||||||
def text_in_bbox(bbox, text):
|
def text_in_bbox(bbox, text):
|
||||||
"""Returns all text objects present inside a
|
|
||||||
table's bounding box.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
bbox : tuple
|
|
||||||
Tuple (x1, y1, x2, y2) representing table bounding box where
|
|
||||||
(x1, y1) -> lb and (x2, y2) -> rt in PDFMiner's coordinate space.
|
|
||||||
|
|
||||||
text : list
|
|
||||||
List of PDFMiner text objects.
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
t_bbox : list
|
|
||||||
List of PDFMiner text objects that lie inside table.
|
|
||||||
"""
|
|
||||||
lb = (bbox[0], bbox[1])
|
lb = (bbox[0], bbox[1])
|
||||||
rt = (bbox[2], bbox[3])
|
rt = (bbox[2], bbox[3])
|
||||||
t_bbox = [t for t in text if lb[0] - 2 <= (t.x0 + t.x1) / 2.0
|
t_bbox = [t for t in text if lb[0] - 2 <= (t.x0 + t.x1) / 2.0
|
||||||
|
|
@ -297,20 +135,6 @@ def text_in_bbox(bbox, text):
|
||||||
|
|
||||||
|
|
||||||
def remove_close_values(ar, mtol=2):
|
def remove_close_values(ar, mtol=2):
|
||||||
"""Removes values which are within a tolerance of mtol of another value
|
|
||||||
present in list.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
ar : list
|
|
||||||
|
|
||||||
mtol : int
|
|
||||||
(optional, default: 2)
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
ret : list
|
|
||||||
"""
|
|
||||||
ret = []
|
ret = []
|
||||||
for a in ar:
|
for a in ar:
|
||||||
if not ret:
|
if not ret:
|
||||||
|
|
@ -325,20 +149,6 @@ def remove_close_values(ar, mtol=2):
|
||||||
|
|
||||||
|
|
||||||
def merge_close_values(ar, mtol=2):
|
def merge_close_values(ar, mtol=2):
|
||||||
"""Merges values which are within a tolerance of mtol by calculating
|
|
||||||
a moving mean.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
ar : list
|
|
||||||
|
|
||||||
mtol : int
|
|
||||||
(optional, default: 2)
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
ret : list
|
|
||||||
"""
|
|
||||||
ret = []
|
ret = []
|
||||||
for a in ar:
|
for a in ar:
|
||||||
if not ret:
|
if not ret:
|
||||||
|
|
@ -354,22 +164,6 @@ def merge_close_values(ar, mtol=2):
|
||||||
|
|
||||||
|
|
||||||
def flag_on_size(textline, direction):
|
def flag_on_size(textline, direction):
|
||||||
"""Flags a super/subscript by enclosing it with <s></s>. May give
|
|
||||||
false positives.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
textline : list
|
|
||||||
List of PDFMiner LTChar objects.
|
|
||||||
|
|
||||||
direction : string
|
|
||||||
{'horizontal', 'vertical'}
|
|
||||||
Direction of the PDFMiner LTTextLine object.
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
fstring : string
|
|
||||||
"""
|
|
||||||
if direction == 'horizontal':
|
if direction == 'horizontal':
|
||||||
d = [(t.get_text(), np.round(t.height, decimals=6)) for t in textline if not isinstance(t, LTAnno)]
|
d = [(t.get_text(), np.round(t.height, decimals=6)) for t in textline if not isinstance(t, LTAnno)]
|
||||||
elif direction == 'vertical':
|
elif direction == 'vertical':
|
||||||
|
|
@ -396,33 +190,6 @@ def flag_on_size(textline, direction):
|
||||||
|
|
||||||
|
|
||||||
def split_textline(table, textline, direction, flag_size=True):
|
def split_textline(table, textline, direction, flag_size=True):
|
||||||
"""Splits PDFMiner LTTextLine into substrings if it spans across
|
|
||||||
multiple rows/columns.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
table : object
|
|
||||||
camelot.pdf.Pdf
|
|
||||||
|
|
||||||
textline : object
|
|
||||||
PDFMiner LTTextLine object.
|
|
||||||
|
|
||||||
direction : string
|
|
||||||
{'horizontal', 'vertical'}
|
|
||||||
Direction of the PDFMiner LTTextLine object.
|
|
||||||
|
|
||||||
flag_size : bool
|
|
||||||
Whether or not to highlight a substring using <s></s>
|
|
||||||
if its size is different from rest of the string, useful for
|
|
||||||
super and subscripts.
|
|
||||||
(optional, default: True)
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
grouped_chars : list
|
|
||||||
List of tuples of the form (idx, text) where idx is the index
|
|
||||||
of row/column and text is an lttextline substring.
|
|
||||||
"""
|
|
||||||
idx = 0
|
idx = 0
|
||||||
cut_text = []
|
cut_text = []
|
||||||
bbox = textline.bbox
|
bbox = textline.bbox
|
||||||
|
|
@ -474,47 +241,6 @@ def split_textline(table, textline, direction, flag_size=True):
|
||||||
|
|
||||||
|
|
||||||
def get_table_index(table, t, direction, split_text=False, flag_size=True):
|
def get_table_index(table, t, direction, split_text=False, flag_size=True):
|
||||||
"""Gets indices of the cell where given text object lies by
|
|
||||||
comparing their y and x-coordinates.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
table : object
|
|
||||||
camelot.table.Table
|
|
||||||
|
|
||||||
t : object
|
|
||||||
PDFMiner LTTextLine object.
|
|
||||||
|
|
||||||
direction : string
|
|
||||||
{'horizontal', 'vertical'}
|
|
||||||
Direction of the PDFMiner LTTextLine object.
|
|
||||||
|
|
||||||
split_text : bool
|
|
||||||
Whether or not to split a text line if it spans across
|
|
||||||
multiple cells.
|
|
||||||
(optional, default: False)
|
|
||||||
|
|
||||||
flag_size : bool
|
|
||||||
Whether or not to highlight a substring using <s></s>
|
|
||||||
if its size is different from rest of the string, useful for
|
|
||||||
super and subscripts.
|
|
||||||
(optional, default: True)
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
indices : list
|
|
||||||
List of tuples of the form (idx, text) where idx is the index
|
|
||||||
of row/column and text is an lttextline substring.
|
|
||||||
|
|
||||||
error : float
|
|
||||||
Assignment error, percentage of text area that lies outside
|
|
||||||
a cell.
|
|
||||||
+-------+
|
|
||||||
| |
|
|
||||||
| [Text bounding box]
|
|
||||||
| |
|
|
||||||
+-------+
|
|
||||||
"""
|
|
||||||
r_idx, c_idx = [-1] * 2
|
r_idx, c_idx = [-1] * 2
|
||||||
for r in range(len(table.rows)):
|
for r in range(len(table.rows)):
|
||||||
if ((t.y0 + t.y1) / 2.0 < table.rows[r][0] and
|
if ((t.y0 + t.y1) / 2.0 < table.rows[r][0] and
|
||||||
|
|
@ -558,20 +284,6 @@ def get_table_index(table, t, direction, split_text=False, flag_size=True):
|
||||||
|
|
||||||
|
|
||||||
def compute_accuracy(error_weights):
|
def compute_accuracy(error_weights):
|
||||||
"""Calculates score based on weights assigned to various parameters,
|
|
||||||
and their error percentages.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
error_weights : list
|
|
||||||
Two-dimensional list of the form [[p1, e1], [p2, e2], ...]
|
|
||||||
where pn is the weight assigned to list of errors en.
|
|
||||||
Sum of pn should be equal to 100.
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
score : float
|
|
||||||
"""
|
|
||||||
SCORE_VAL = 100
|
SCORE_VAL = 100
|
||||||
try:
|
try:
|
||||||
score = 0
|
score = 0
|
||||||
|
|
@ -587,16 +299,6 @@ def compute_accuracy(error_weights):
|
||||||
|
|
||||||
|
|
||||||
def remove_empty(d):
|
def remove_empty(d):
|
||||||
"""Removes empty rows and columns from a two-dimensional list.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
d : list
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
d : list
|
|
||||||
"""
|
|
||||||
for i, row in enumerate(d):
|
for i, row in enumerate(d):
|
||||||
if row == [''] * len(row):
|
if row == [''] * len(row):
|
||||||
d.pop(i)
|
d.pop(i)
|
||||||
|
|
@ -607,23 +309,6 @@ def remove_empty(d):
|
||||||
|
|
||||||
|
|
||||||
def count_empty(d):
|
def count_empty(d):
|
||||||
"""Counts empty rows and columns in a two-dimensional list.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
d : list
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
n_empty_rows : list
|
|
||||||
Number of empty rows.
|
|
||||||
|
|
||||||
n_empty_cols : list
|
|
||||||
Number of empty columns.
|
|
||||||
|
|
||||||
empty_p : float
|
|
||||||
Percentage of empty cells.
|
|
||||||
"""
|
|
||||||
empty_p = 0
|
empty_p = 0
|
||||||
r_nempty_cells, c_nempty_cells = [], []
|
r_nempty_cells, c_nempty_cells = [], []
|
||||||
for i in d:
|
for i in d:
|
||||||
|
|
@ -649,41 +334,11 @@ def count_empty(d):
|
||||||
|
|
||||||
|
|
||||||
def encode_(ar):
|
def encode_(ar):
|
||||||
"""Encodes list of text.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
ar : list
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
ar : list
|
|
||||||
"""
|
|
||||||
ar = [[r.encode('utf-8') for r in row] for row in ar]
|
ar = [[r.encode('utf-8') for r in row] for row in ar]
|
||||||
return ar
|
return ar
|
||||||
|
|
||||||
|
|
||||||
def get_text_objects(layout, ltype="char", t=None):
|
def get_text_objects(layout, ltype="char", t=None):
|
||||||
"""Recursively parses pdf layout to get a list of
|
|
||||||
text objects.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
layout : object
|
|
||||||
PDFMiner LTPage object.
|
|
||||||
|
|
||||||
ltype : string
|
|
||||||
{'char', 'lh', 'lv'}
|
|
||||||
Specify 'char', 'lh', 'lv' to get LTChar, LTTextLineHorizontal,
|
|
||||||
and LTTextLineVertical objects respectively.
|
|
||||||
|
|
||||||
t : list
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
t : list
|
|
||||||
List of PDFMiner text objects.
|
|
||||||
"""
|
|
||||||
if ltype == "char":
|
if ltype == "char":
|
||||||
LTObject = LTChar
|
LTObject = LTChar
|
||||||
elif ltype == "lh":
|
elif ltype == "lh":
|
||||||
|
|
@ -705,33 +360,6 @@ def get_text_objects(layout, ltype="char", t=None):
|
||||||
|
|
||||||
def get_page_layout(pname, char_margin=1.0, line_margin=0.5, word_margin=0.1,
|
def get_page_layout(pname, char_margin=1.0, line_margin=0.5, word_margin=0.1,
|
||||||
detect_vertical=True, all_texts=True):
|
detect_vertical=True, all_texts=True):
|
||||||
"""Returns a PDFMiner LTPage object and page dimension of a single
|
|
||||||
page pdf. See https://euske.github.io/pdfminer/ to get definitions
|
|
||||||
of kwargs.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
pname : string
|
|
||||||
Path to pdf file.
|
|
||||||
|
|
||||||
char_margin : float
|
|
||||||
|
|
||||||
line_margin : float
|
|
||||||
|
|
||||||
word_margin : float
|
|
||||||
|
|
||||||
detect_vertical : bool
|
|
||||||
|
|
||||||
all_texts : bool
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
layout : object
|
|
||||||
PDFMiner LTPage object.
|
|
||||||
|
|
||||||
dim : tuple
|
|
||||||
pdf page dimension of the form (width, height).
|
|
||||||
"""
|
|
||||||
with open(pname, 'r') as f:
|
with open(pname, 'r') as f:
|
||||||
parser = PDFParser(f)
|
parser = PDFParser(f)
|
||||||
document = PDFDocument(parser)
|
document = PDFDocument(parser)
|
||||||
|
|
@ -755,16 +383,6 @@ def get_page_layout(pname, char_margin=1.0, line_margin=0.5, word_margin=0.1,
|
||||||
|
|
||||||
|
|
||||||
def merge_tuples(tuples):
|
def merge_tuples(tuples):
|
||||||
"""Merges a list of overlapping tuples.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
tuples : list
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
merged : list
|
|
||||||
"""
|
|
||||||
merged = list(tuples[0])
|
merged = list(tuples[0])
|
||||||
for s, e in tuples:
|
for s, e in tuples:
|
||||||
if s <= merged[1]:
|
if s <= merged[1]:
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue