diff --git a/camelot/image_processing.py b/camelot/image_processing.py
index 1621bea..fc284e4 100644
--- a/camelot/image_processing.py
+++ b/camelot/image_processing.py
@@ -8,35 +8,6 @@ from .utils import merge_tuples
def adaptive_threshold(imagename, invert=False, blocksize=15, c=-2):
- """Thresholds an image using OpenCV's adaptiveThreshold.
-
- Parameters
- ----------
- imagename : string
- Path to image file.
-
- invert : bool
- Whether or not to invert the image. Useful when pdfs have
- tables with lines in background.
- (optional, default: False)
-
- blocksize: int
- Size of a pixel neighborhood that is used to calculate a
- threshold value for the pixel: 3, 5, 7, and so on.
-
- c: float
- Constant subtracted from the mean or weighted mean
- (see the details below). Normally, it is positive but may be
- zero or negative as well.
-
- Returns
- -------
- img : object
- numpy.ndarray representing the original image.
-
- threshold : object
- numpy.ndarray representing the thresholded image.
- """
img = cv2.imread(imagename)
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
@@ -50,38 +21,6 @@ def adaptive_threshold(imagename, invert=False, blocksize=15, c=-2):
def find_lines(threshold, direction='horizontal', scale=15, iterations=0):
- """Finds horizontal and vertical lines by applying morphological
- transformations on an image.
-
- Parameters
- ----------
- threshold : object
- numpy.ndarray representing the thresholded image.
-
- direction : string
- Specifies whether to find vertical or horizontal lines.
- (default: 'horizontal')
-
- scale : int
- Used to divide the height/width to get a structuring element
- for morph transform.
- (optional, default: 15)
-
- iterations : int
- Number of iterations for dilation.
- (optional, default: 2)
-
- Returns
- -------
- dmask : object
- numpy.ndarray representing pixels where vertical/horizontal
- lines lie.
-
- lines : list
- List of tuples representing vertical/horizontal lines with
- coordinates relative to a left-top origin in
- OpenCV's coordinate space.
- """
lines = []
if direction == 'vertical':
@@ -118,23 +57,6 @@ def find_lines(threshold, direction='horizontal', scale=15, iterations=0):
def find_table_contours(vertical, horizontal):
- """Finds table boundaries using OpenCV's findContours.
-
- Parameters
- ----------
- vertical : object
- numpy.ndarray representing pixels where vertical lines lie.
-
- horizontal : object
- numpy.ndarray representing pixels where horizontal lines lie.
-
- Returns
- -------
- cont : list
- List of tuples representing table boundaries. Each tuple is of
- the form (x, y, w, h) where (x, y) -> left-top, w -> width and
- h -> height in OpenCV's coordinate space.
- """
mask = vertical + horizontal
try:
@@ -154,30 +76,6 @@ def find_table_contours(vertical, horizontal):
def find_table_joints(contours, vertical, horizontal):
- """Finds joints/intersections present inside each table boundary.
-
- Parameters
- ----------
- contours : list
- List of tuples representing table boundaries. Each tuple is of
- the form (x, y, w, h) where (x, y) -> left-top, w -> width and
- h -> height in OpenCV's coordinate space.
-
- vertical : object
- numpy.ndarray representing pixels where vertical lines lie.
-
- horizontal : object
- numpy.ndarray representing pixels where horizontal lines lie.
-
- Returns
- -------
- tables : dict
- Dict with table boundaries as keys and list of intersections
- in that boundary as their value.
-
- Keys are of the form (x1, y1, x2, y2) where (x1, y1) -> lb
- and (x2, y2) -> rt in OpenCV's coordinate space.
- """
joints = np.bitwise_and(vertical, horizontal)
tables = {}
for c in contours:
@@ -202,23 +100,6 @@ def find_table_joints(contours, vertical, horizontal):
def remove_lines(threshold, line_scale=15):
- """Removes lines from a thresholded image.
-
- Parameters
- ----------
- threshold : object
- numpy.ndarray representing the thresholded image.
-
- line_scale : int
- Line scaling factor.
- (optional, default: 15)
-
- Returns
- -------
- threshold : object
- numpy.ndarray representing the thresholded image
- with horizontal and vertical lines removed.
- """
size = threshold.shape[0] // line_scale
vertical_erode_el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size))
horizontal_erode_el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1))
@@ -236,22 +117,6 @@ def remove_lines(threshold, line_scale=15):
def find_cuts(threshold, char_scale=200):
- """Finds cuts made by text projections on y-axis.
-
- Parameters
- ----------
- threshold : object
- numpy.ndarray representing the thresholded image.
-
- char_scale : int
- Char scaling factor.
- (optional, default: 200)
-
- Returns
- -------
- y_cuts : list
- List of cuts on y-axis.
- """
size = threshold.shape[0] // char_scale
char_el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size))
diff --git a/camelot/utils.py b/camelot/utils.py
index 650e62a..df82a8a 100644
--- a/camelot/utils.py
+++ b/camelot/utils.py
@@ -19,61 +19,16 @@ from pdfminer.layout import (LAParams, LTAnno, LTChar, LTTextLineHorizontal,
def translate(x1, x2):
- """Translates x2 by x1.
-
- Parameters
- ----------
- x1 : float
-
- x2 : float
-
- Returns
- -------
- x2 : float
- """
x2 += x1
return x2
def scale(x, s):
- """Scales x by scaling factor s.
-
- Parameters
- ----------
- x : float
-
- s : float
-
- Returns
- -------
- x : float
- """
x *= s
return x
def rotate(x1, y1, x2, y2, angle):
- """Rotates point x2, y2 about point x1, y1 by angle.
-
- Parameters
- ----------
- x1 : float
-
- y1 : float
-
- x2 : float
-
- y2 : float
-
- angle : float
- Angle in radians.
-
- Returns
- -------
- xnew : float
-
- ynew : float
- """
s = np.sin(angle)
c = np.cos(angle)
x2 = translate(-x1, x2)
@@ -86,28 +41,6 @@ def rotate(x1, y1, x2, y2, angle):
def scale_to_image(k, factors):
- """Translates and scales PDFMiner coordinates to OpenCV's coordinate
- space.
-
- Parameters
- ----------
- k : tuple
- Tuple (x1, y1, x2, y2) representing table bounding box where
- (x1, y1) -> lt and (x2, y2) -> rb in PDFMiner's coordinate
- space.
-
- factors : tuple
- Tuple (scaling_factor_x, scaling_factor_y, pdf_y) where the
- first two elements are scaling factors and pdf_y is height of
- pdf.
-
- Returns
- -------
- knew : tuple
- Tuple (x1, y1, x2, y2) representing table bounding box where
- (x1, y1) -> lt and (x2, y2) -> rb in OpenCV's coordinate
- space.
- """
x1, y1, x2, y2 = k
scaling_factor_x, scaling_factor_y, pdf_y = factors
x1 = scale(x1, scaling_factor_x)
@@ -119,34 +52,6 @@ def scale_to_image(k, factors):
def scale_to_pdf(tables, v_segments, h_segments, factors):
- """Translates and scales OpenCV coordinates to PDFMiner's coordinate
- space.
-
- Parameters
- ----------
- tables : dict
- Dict with table boundaries as keys and list of intersections
- in that boundary as their value.
-
- v_segments : list
- List of vertical line segments.
-
- h_segments : list
- List of horizontal line segments.
-
- factors : tuple
- Tuple (scaling_factor_x, scaling_factor_y, img_y) where the
- first two elements are scaling factors and img_y is height of
- image.
-
- Returns
- -------
- tables_new : dict
-
- v_segments_new : dict
-
- h_segments_new : dict
- """
scaling_factor_x, scaling_factor_y, img_y = factors
tables_new = {}
for k in tables.keys():
@@ -179,12 +84,6 @@ def scale_to_pdf(tables, v_segments, h_segments, factors):
def setup_logging(log_filepath):
- """Setup logging
- Args:
- log_filepath (string): Path to log file
- Returns:
- logging.Logger: Logger object
- """
logger = logging.getLogger("app_logger")
logger.setLevel(logging.DEBUG)
# Log File Handler (Associating one log file per webservice run)
@@ -206,27 +105,6 @@ def setup_logging(log_filepath):
def get_rotation(lttextlh, lttextlv, ltchar):
- """Detects if text in table is vertical or not using the current
- transformation matrix (CTM) and returns its orientation.
-
- Parameters
- ----------
- lttextlh : list
- List of PDFMiner LTTextLineHorizontal objects.
-
- lttextlv : list
- List of PDFMiner LTTextLineVertical objects.
-
- ltchar : list
- List of PDFMiner LTChar objects.
-
- Returns
- -------
- rotation : string
- {'', 'left', 'right'}
- '' if text in table is upright, 'left' if rotated 90 degree
- anti-clockwise and 'right' if rotated 90 degree clockwise.
- """
rotation = ''
hlen = len([t for t in lttextlh if t.get_text().strip()])
vlen = len([t for t in lttextlv if t.get_text().strip()])
@@ -238,29 +116,6 @@ def get_rotation(lttextlh, lttextlv, ltchar):
def segments_bbox(bbox, v_segments, h_segments):
- """Returns all line segments present inside a
- table's bounding box.
-
- Parameters
- ----------
- bbox : tuple
- Tuple (x1, y1, x2, y2) representing table bounding box where
- (x1, y1) -> lb and (x2, y2) -> rt in PDFMiner's coordinate space.
-
- v_segments : list
- List of vertical line segments.
-
- h_segments : list
- List of vertical horizontal segments.
-
- Returns
- -------
- v_s : list
- List of vertical line segments that lie inside table.
-
- h_s : list
- List of horizontal line segments that lie inside table.
- """
lb = (bbox[0], bbox[1])
rt = (bbox[2], bbox[3])
v_s = [v for v in v_segments if v[1] > lb[1] - 2 and
@@ -271,23 +126,6 @@ def segments_bbox(bbox, v_segments, h_segments):
def text_in_bbox(bbox, text):
- """Returns all text objects present inside a
- table's bounding box.
-
- Parameters
- ----------
- bbox : tuple
- Tuple (x1, y1, x2, y2) representing table bounding box where
- (x1, y1) -> lb and (x2, y2) -> rt in PDFMiner's coordinate space.
-
- text : list
- List of PDFMiner text objects.
-
- Returns
- -------
- t_bbox : list
- List of PDFMiner text objects that lie inside table.
- """
lb = (bbox[0], bbox[1])
rt = (bbox[2], bbox[3])
t_bbox = [t for t in text if lb[0] - 2 <= (t.x0 + t.x1) / 2.0
@@ -297,20 +135,6 @@ def text_in_bbox(bbox, text):
def remove_close_values(ar, mtol=2):
- """Removes values which are within a tolerance of mtol of another value
- present in list.
-
- Parameters
- ----------
- ar : list
-
- mtol : int
- (optional, default: 2)
-
- Returns
- -------
- ret : list
- """
ret = []
for a in ar:
if not ret:
@@ -325,20 +149,6 @@ def remove_close_values(ar, mtol=2):
def merge_close_values(ar, mtol=2):
- """Merges values which are within a tolerance of mtol by calculating
- a moving mean.
-
- Parameters
- ----------
- ar : list
-
- mtol : int
- (optional, default: 2)
-
- Returns
- -------
- ret : list
- """
ret = []
for a in ar:
if not ret:
@@ -354,22 +164,6 @@ def merge_close_values(ar, mtol=2):
def flag_on_size(textline, direction):
- """Flags a super/subscript by enclosing it with . May give
- false positives.
-
- Parameters
- ----------
- textline : list
- List of PDFMiner LTChar objects.
-
- direction : string
- {'horizontal', 'vertical'}
- Direction of the PDFMiner LTTextLine object.
-
- Returns
- -------
- fstring : string
- """
if direction == 'horizontal':
d = [(t.get_text(), np.round(t.height, decimals=6)) for t in textline if not isinstance(t, LTAnno)]
elif direction == 'vertical':
@@ -396,33 +190,6 @@ def flag_on_size(textline, direction):
def split_textline(table, textline, direction, flag_size=True):
- """Splits PDFMiner LTTextLine into substrings if it spans across
- multiple rows/columns.
-
- Parameters
- ----------
- table : object
- camelot.pdf.Pdf
-
- textline : object
- PDFMiner LTTextLine object.
-
- direction : string
- {'horizontal', 'vertical'}
- Direction of the PDFMiner LTTextLine object.
-
- flag_size : bool
- Whether or not to highlight a substring using
- if its size is different from rest of the string, useful for
- super and subscripts.
- (optional, default: True)
-
- Returns
- -------
- grouped_chars : list
- List of tuples of the form (idx, text) where idx is the index
- of row/column and text is the an lttextline substring.
- """
idx = 0
cut_text = []
bbox = textline.bbox
@@ -474,47 +241,6 @@ def split_textline(table, textline, direction, flag_size=True):
def get_table_index(table, t, direction, split_text=False, flag_size=True):
- """Gets indices of the cell where given text object lies by
- comparing their y and x-coordinates.
-
- Parameters
- ----------
- table : object
- camelot.table.Table
-
- t : object
- PDFMiner LTTextLine object.
-
- direction : string
- {'horizontal', 'vertical'}
- Direction of the PDFMiner LTTextLine object.
-
- split_text : bool
- Whether or not to split a text line if it spans across
- multiple cells.
- (optional, default: False)
-
- flag_size : bool
- Whether or not to highlight a substring using
- if its size is different from rest of the string, useful for
- super and subscripts.
- (optional, default: True)
-
- Returns
- -------
- indices : list
- List of tuples of the form (idx, text) where idx is the index
- of row/column and text is the an lttextline substring.
-
- error : float
- Assignment error, percentage of text area that lies outside
- a cell.
- +-------+
- | |
- | [Text bounding box]
- | |
- +-------+
- """
r_idx, c_idx = [-1] * 2
for r in range(len(table.rows)):
if ((t.y0 + t.y1) / 2.0 < table.rows[r][0] and
@@ -558,20 +284,6 @@ def get_table_index(table, t, direction, split_text=False, flag_size=True):
def compute_accuracy(error_weights):
- """Calculates score based on weights assigned to various parameters,
- and their error percentages.
-
- Parameters
- ----------
- error_weights : list
- Two-dimensional list of the form [[p1, e1], [p2, e2], ...]
- where pn is the weight assigned to list of errors en.
- Sum of pn should be equal to 100.
-
- Returns
- -------
- score : float
- """
SCORE_VAL = 100
try:
score = 0
@@ -587,16 +299,6 @@ def compute_accuracy(error_weights):
def remove_empty(d):
- """Removes empty rows and columns from a two-dimensional list.
-
- Parameters
- ----------
- d : list
-
- Returns
- -------
- d : list
- """
for i, row in enumerate(d):
if row == [''] * len(row):
d.pop(i)
@@ -607,23 +309,6 @@ def remove_empty(d):
def count_empty(d):
- """Counts empty rows and columns in a two-dimensional list.
-
- Parameters
- ----------
- d : list
-
- Returns
- -------
- n_empty_rows : list
- Number of empty rows.
-
- n_empty_cols : list
- Number of empty columns.
-
- empty_p : float
- Percentage of empty cells.
- """
empty_p = 0
r_nempty_cells, c_nempty_cells = [], []
for i in d:
@@ -649,41 +334,11 @@ def count_empty(d):
def encode_(ar):
- """Encodes list of text.
-
- Parameters
- ----------
- ar : list
-
- Returns
- -------
- ar : list
- """
ar = [[r.encode('utf-8') for r in row] for row in ar]
return ar
def get_text_objects(layout, ltype="char", t=None):
- """Recursively parses pdf layout to get a list of
- text objects.
-
- Parameters
- ----------
- layout : object
- PDFMiner LTPage object.
-
- ltype : string
- {'char', 'lh', 'lv'}
- Specify 'char', 'lh', 'lv' to get LTChar, LTTextLineHorizontal,
- and LTTextLineVertical objects respectively.
-
- t : list
-
- Returns
- -------
- t : list
- List of PDFMiner text objects.
- """
if ltype == "char":
LTObject = LTChar
elif ltype == "lh":
@@ -705,33 +360,6 @@ def get_text_objects(layout, ltype="char", t=None):
def get_page_layout(pname, char_margin=1.0, line_margin=0.5, word_margin=0.1,
detect_vertical=True, all_texts=True):
- """Returns a PDFMiner LTPage object and page dimension of a single
- page pdf. See https://euske.github.io/pdfminer/ to get definitions
- of kwargs.
-
- Parameters
- ----------
- pname : string
- Path to pdf file.
-
- char_margin : float
-
- line_margin : float
-
- word_margin : float
-
- detect_vertical : bool
-
- all_texts : bool
-
- Returns
- -------
- layout : object
- PDFMiner LTPage object.
-
- dim : tuple
- pdf page dimension of the form (width, height).
- """
with open(pname, 'r') as f:
parser = PDFParser(f)
document = PDFDocument(parser)
@@ -755,16 +383,6 @@ def get_page_layout(pname, char_margin=1.0, line_margin=0.5, word_margin=0.1,
def merge_tuples(tuples):
- """Merges a list of overlapping tuples.
-
- Parameters
- ----------
- tuples : list
-
- Returns
- -------
- merged : list
- """
merged = list(tuples[0])
for s, e in tuples:
if s <= merged[1]: