diff --git a/camelot/hybrid.py b/camelot/hybrid.py new file mode 100644 index 0000000..e69de29 diff --git a/camelot/imgproc.py b/camelot/imgproc.py new file mode 100644 index 0000000..cccb597 --- /dev/null +++ b/camelot/imgproc.py @@ -0,0 +1,98 @@ +import cv2 +import numpy as np + + +def adaptive_threshold(imagename, invert=False): + img = cv2.imread(imagename) + gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + + if invert: + threshold = cv2.adaptiveThreshold( + gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, + 15, -0.2) + else: + threshold = cv2.adaptiveThreshold( + np.invert(gray), 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, + cv2.THRESH_BINARY, + 15, -0.2) + return img, threshold + + +def find_lines(threshold, direction=None, scale=15): + lines = [] + + if direction == 'vertical': + size = threshold.shape[0] // scale + el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size)) + elif direction == 'horizontal': + size = threshold.shape[1] // scale + el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1)) + elif direction is None: + raise ValueError("Specify direction as either 'vertical' or" + " 'horizontal'") + + threshold = cv2.erode(threshold, el, (-1, -1)) + threshold = cv2.dilate(threshold, el, (-1, -1)) + + dmask = threshold # findContours modifies source image + + try: + _, contours, _ = cv2.findContours( + threshold, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + except ValueError: + contours, _ = cv2.findContours( + threshold, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + + for c in contours: + x, y, w, h = cv2.boundingRect(c) + x1, x2 = x, x + w + y1, y2 = y, y + h + if direction == 'vertical': + lines.append(((x1 + x2) / 2, y2, (x1 + x2) / 2, y1)) + elif direction == 'horizontal': + lines.append((x1, (y1 + y2) / 2, x2, (y1 + y2) / 2)) + + return dmask, lines + + +def find_table_contours(vertical, horizontal): + mask = vertical + horizontal + + try: + __, contours, __ = cv2.findContours( + mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + except ValueError: + contours, __ = cv2.findContours( + mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10] + + cont = [] + for c in contours: + c_poly = cv2.approxPolyDP(c, 3, True) + x, y, w, h = cv2.boundingRect(c_poly) + cont.append((x, y, w, h)) + return cont + + +def find_table_joints(contours, vertical, horizontal): + joints = np.bitwise_and(vertical, horizontal) + tables = {} + for c in contours: + x, y, w, h = c + roi = joints[y : y + h, x : x + w] + try: + __, jc, __ = cv2.findContours( + roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE) + except ValueError: + jc, __ = cv2.findContours( + roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE) + if len(jc) <= 4: # remove contours with less than 4 joints + continue + joint_coords = [] + for j in jc: + jx, jy, jw, jh = cv2.boundingRect(j) + c1, c2 = x + (2 * jx + jw) / 2, y + (2 * jy + jh) / 2 + joint_coords.append((c1, c2)) + tables[(x, y + h, x + w, y)] = joint_coords + + return tables \ No newline at end of file diff --git a/camelot/lattice.py b/camelot/lattice.py index b31a1a1..92a0a45 100644 --- a/camelot/lattice.py +++ b/camelot/lattice.py @@ -4,15 +4,15 @@ import types import copy_reg import logging -import cv2 -import numpy as np - from wand.image import Image +from .imgproc import (adaptive_threshold, find_lines, find_table_contours, + find_table_joints) from .table import Table -from .utils import (transform, segments_bbox, text_bbox, detect_vertical, merge_close_values, - get_row_index, get_column_index, get_score, reduce_index, - outline, fill_spanning, count_empty, encode_list, pdf_to_text) +from .utils import (scale_to_pdf, scale_to_image, segments_bbox, text_bbox, + detect_vertical, merge_close_values, get_row_index, + get_column_index, get_score, reduce_index, outline, + fill_spanning, count_empty, encode_list, pdf_to_text) __all__ = ['Lattice'] @@ -26,128 +26,6 @@ def _reduce_method(m): copy_reg.pickle(types.MethodType, _reduce_method) -def _morph_transform(imagename, scale=15, invert=False): - """Morphological Transformation - - Applies a series of morphological operations on the image - to find table contours and line segments. - http://answers.opencv.org/question/63847/how-to-extract-tables-from-an-image/ - - Empirical result for adaptiveThreshold's blockSize=5 and C=-0.2 - taken from http://pequan.lip6.fr/~bereziat/pima/2012/seuillage/sezgin04.pdf - - Parameters - ---------- - imagename : Path to image. - - scale : int - Scaling factor. Large scaling factor leads to smaller lines - being detected. (optional, default: 15) - - invert : bool - Invert pdf image to make sure that lines are in foreground. - (optional, default: False) - - Returns - ------- - img : ndarray - - tables : dict - Dictionary with table bounding box as key and list of - joints found in the table as value. - - v_segments : list - List of vertical line segments found in the image. - - h_segments : list - List of horizontal line segments found in the image. - """ - img = cv2.imread(imagename) - gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) - - if invert: - threshold = cv2.adaptiveThreshold( - gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, - 15, -0.2) - else: - threshold = cv2.adaptiveThreshold( - np.invert(gray), 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, - cv2.THRESH_BINARY, - 15, -0.2) - - vertical = threshold - horizontal = threshold - - verticalsize = vertical.shape[0] // scale - horizontalsize = horizontal.shape[1] // scale - - ver = cv2.getStructuringElement(cv2.MORPH_RECT, (1, verticalsize)) - hor = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontalsize, 1)) - - vertical = cv2.erode(vertical, ver, (-1, -1)) - vertical = cv2.dilate(vertical, ver, (-1, -1)) - - horizontal = cv2.erode(horizontal, hor, (-1, -1)) - horizontal = cv2.dilate(horizontal, hor, (-1, -1)) - - mask = vertical + horizontal - joints = np.bitwise_and(vertical, horizontal) - try: - __, contours, __ = cv2.findContours( - mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) - except ValueError: - contours, __ = cv2.findContours( - mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) - contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10] - - tables = {} - for c in contours: - c_poly = cv2.approxPolyDP(c, 3, True) - x, y, w, h = cv2.boundingRect(c_poly) - roi = joints[y : y + h, x : x + w] - try: - __, jc, __ = cv2.findContours( - roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE) - except ValueError: - jc, __ = cv2.findContours( - roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE) - if len(jc) <= 4: # remove contours with less than <=4 joints - continue - joint_coords = [] - for j in jc: - jx, jy, jw, jh = cv2.boundingRect(j) - c1, c2 = x + (2 * jx + jw) / 2, y + (2 * jy + jh) / 2 - joint_coords.append((c1, c2)) - tables[(x, y + h, x + w, y)] = joint_coords - - v_segments, h_segments = [], [] - try: - _, vcontours, _ = cv2.findContours( - vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) - except ValueError: - vcontours, _ = cv2.findContours( - vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) - for vc in vcontours: - x, y, w, h = cv2.boundingRect(vc) - x1, x2 = x, x + w - y1, y2 = y, y + h - v_segments.append(((x1 + x2) / 2, y2, (x1 + x2) / 2, y1)) - - try: - _, hcontours, _ = cv2.findContours( - horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) - except ValueError: - hcontours, _ = cv2.findContours( - horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) - for hc in hcontours: - x, y, w, h = cv2.boundingRect(hc) - x1, x2 = x, x + w - y1, y2 = y, y + h - h_segments.append((x1, (y1 + y2) / 2, x2, (y1 + y2) / 2)) - - return img, tables, v_segments, h_segments - - class Lattice: """Lattice algorithm @@ -188,17 +66,17 @@ class Lattice: Dictionary with page number as key and list of tables on that page as value. """ - - def __init__(self, fill=None, scale=15, jtol=2, mtol=2, - invert=False, pdf_margin=(2.0, 0.5, 0.1), debug=None): + def __init__(self, table_area=None, fill=None, jtol=[2], mtol=[2], scale=15, + invert=False, margins=(2.0, 0.5, 0.1), debug=None): self.method = 'lattice' + self.table_area = table_area self.fill = fill - self.scale = scale self.jtol = jtol self.mtol = mtol + self.scale = scale self.invert = invert - self.char_margin, self.line_margin, self.word_margin = pdf_margin + self.char_margin, self.line_margin, self.word_margin = margins self.debug = debug def get_tables(self, pdfname): @@ -217,48 +95,79 @@ class Lattice: logging.warning("{0}: PDF has no text. It may be an image.".format( os.path.basename(bname))) return None + imagename = ''.join([bname, '.png']) with Image(filename=pdfname, depth=8, resolution=300) as png: png.save(filename=imagename) + + img, threshold = adaptive_threshold(imagename, invert=self.invert) pdf_x = width pdf_y = height - img, table_bbox, v_segments, h_segments = _morph_transform( - imagename, scale=self.scale, invert=self.invert) img_x = img.shape[1] img_y = img.shape[0] - scaling_factor_x = pdf_x / float(img_x) - scaling_factor_y = pdf_y / float(img_y) + sc_x_image = img_x / float(pdf_x) + sc_y_image = img_y / float(pdf_y) + sc_x_pdf = pdf_x / float(img_x) + sc_y_pdf = pdf_y / float(img_y) + factors_image = (sc_x_image, sc_y_image, pdf_y) + factors_pdf = (sc_x_pdf, sc_y_pdf, img_y) + + vmask, v_segments = find_lines(threshold, direction='vertical', + scale=self.scale) + hmask, h_segments = find_lines(threshold, direction='horizontal', + scale=self.scale) + + if self.table_area: + if self.fill: + if len(self.table_area) != len(self.fill): + raise ValueError("message") + if len(self.jtol) == 1 and self.jtol[0] == 2: + self.jtol = self.jtol * len(self.table_area) + if len(self.mtol) == 1 and self.mtol[0] == 2: + self.mtol = self.mtol * len(self.table_area) + areas = [] + for area in self.table_area: + x1, y1, x2, y2 = area.split(",") + x1 = int(x1) + y1 = int(y1) + x2 = int(x2) + y2 = int(y2) + x1, y1, x2, y2 = scale_to_image((x1, y1, x2, y2), factors_image) + areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1))) + table_bbox = find_table_joints(areas, vmask, hmask) + else: + contours = find_table_contours(vmask, hmask) + table_bbox = find_table_joints(contours, vmask, hmask) if self.debug: self.debug_images = (img, table_bbox) - factors = (scaling_factor_x, scaling_factor_y, img_y) - table_bbox, v_segments, h_segments = transform(table_bbox, v_segments, - h_segments, factors) + table_bbox, v_segments, h_segments = scale_to_pdf(table_bbox, v_segments, + h_segments, factors_pdf) if self.debug: self.debug_segments = (v_segments, h_segments) self.debug_tables = [] - pdf_page = {} - page_tables = {} - table_no = 1 + page = {} + tables = {} + table_no = 0 # sort tables based on y-coord for k in sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True): - # select edges which lie within table_bbox - table_info = {} + # select elements which lie within table_bbox + table_data = {} v_s, h_s = segments_bbox(k, v_segments, h_segments) t_bbox = text_bbox(k, text) - table_info['text_p'] = 100 * (1 - (len(t_bbox) / len(text))) + table_data['text_p'] = 100 * (1 - (len(t_bbox) / len(text))) table_rotation = detect_vertical(t_bbox) cols, rows = zip(*table_bbox[k]) cols, rows = list(cols), list(rows) cols.extend([k[0], k[2]]) rows.extend([k[1], k[3]]) # sort horizontal and vertical segments - cols = merge_close_values(sorted(cols), mtol=self.mtol) + cols = merge_close_values(sorted(cols), mtol=self.mtol[table_no]) rows = merge_close_values( - sorted(rows, reverse=True), mtol=self.mtol) + sorted(rows, reverse=True), mtol=self.mtol[table_no]) # make grid using x and y coord of shortlisted rows and cols cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)] @@ -266,9 +175,9 @@ class Lattice: for i in range(0, len(rows) - 1)] table = Table(cols, rows) # set table edges to True using ver+hor lines - table = table.set_edges(v_s, h_s, jtol=self.jtol) + table = table.set_edges(v_s, h_s, jtol=self.jtol[table_no]) nouse = table.nocont_ / (len(v_s) + len(h_s)) - table_info['line_p'] = 100 * (1 - nouse) + table_data['line_p'] = 100 * (1 - nouse) # set spanning cells to True table = table.set_spanning() # set table border edges to True @@ -314,10 +223,10 @@ class Lattice: for t in t_bbox])) score = get_score([[50, rerror], [50, cerror]]) - table_info['score'] = score + table_data['score'] = score - if self.fill is not None: - table = fill_spanning(table, fill=self.fill) + if self.fill: + table = fill_spanning(table, fill=self.fill[table_no]) ar = table.get_list() if table_rotation == 'left': ar = zip(*ar[::-1]) @@ -325,18 +234,18 @@ class Lattice: ar = zip(*ar[::1]) ar.reverse() ar = encode_list(ar) - table_info['data'] = ar + table_data['data'] = ar empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar) - table_info['empty_p'] = empty_p - table_info['r_nempty_cells'] = r_nempty_cells - table_info['c_nempty_cells'] = c_nempty_cells - table_info['nrows'] = len(ar) - table_info['ncols'] = len(ar[0]) - page_tables['table_{0}'.format(table_no)] = table_info + table_data['empty_p'] = empty_p + table_data['r_nempty_cells'] = r_nempty_cells + table_data['c_nempty_cells'] = c_nempty_cells + table_data['nrows'] = len(ar) + table_data['ncols'] = len(ar[0]) + tables['table-{0}'.format(table_no + 1)] = table_data table_no += 1 - pdf_page[os.path.basename(bname)] = page_tables + page[os.path.basename(bname)] = tables if self.debug: return None - return pdf_page \ No newline at end of file + return page \ No newline at end of file diff --git a/camelot/stream.py b/camelot/stream.py index cb53713..8ec3dc9 100644 --- a/camelot/stream.py +++ b/camelot/stream.py @@ -7,7 +7,8 @@ import logging import numpy as np from .table import Table -from .utils import get_row_index, get_score, count_empty, encode_list, pdf_to_text +from .utils import (get_row_index, get_score, count_empty, encode_list, + pdf_to_text, text_bbox) __all__ = ['Stream'] @@ -133,6 +134,17 @@ def _get_column_index(t, columns): return c_idx, error +def _join_rows(rows_grouped, text_y_max, text_y_min): + row_mids = [sum([(t.y0 + t.y1) / 2 for t in r]) / len(r) + if len(r) > 0 else 0 for r in rows_grouped] + rows = [(row_mids[i] + row_mids[i - 1]) / 2 for i in range(1, len(row_mids))] + rows.insert(0, text_y_max) + rows.append(text_y_min) + rows = [(rows[i], rows[i + 1]) + for i in range(0, len(rows) - 1)] + return rows + + def _add_columns(cols, text, ytolerance): if text: text = _group_rows(text, ytol=ytolerance) @@ -143,14 +155,6 @@ def _add_columns(cols, text, ytolerance): return cols -def _get_table_bounds(rows): - x0 = min([t.x0 for r in rows for t in r]) - x1 = max([t.x1 for r in rows for t in r]) - y0 = min([t.y0 for t in rows[-1]]) - y1 = max([t.y1 for t in rows[0]]) - return x0, x1, y0, y1 - - def _join_columns(cols, text_x_min, text_x_max): cols = sorted(cols) cols = [(cols[i][0] + cols[i - 1][1]) / 2 for i in range(1, len(cols))] @@ -194,16 +198,16 @@ class Stream: Dictionary with page number as key and list of tables on that page as value. """ - - def __init__(self, ncolumns=0, columns=None, ytol=2, mtol=2, - pdf_margin=(2.0, 0.5, 0.1), debug=False): + def __init__(self, table_area=None, columns=None, ncolumns=None, ytol=[2], + mtol=[2], margins=(2.0, 0.5, 0.1), debug=False): self.method = 'stream' - self.ncolumns = ncolumns + self.table_area = table_area self.columns = columns + self.ncolumns = ncolumns self.ytol = ytol self.mtol = mtol - self.char_margin, self.line_margin, self.word_margin = pdf_margin + self.char_margin, self.line_margin, self.word_margin = margins self.debug = debug def get_tables(self, pdfname): @@ -222,106 +226,126 @@ class Stream: logging.warning("{0}: PDF has no text. It may be an image.".format( os.path.basename(bname))) return None - text.sort(key=lambda x: (-x.y0, x.x0)) if self.debug: self.debug_text = [(t.x0, t.y0, t.x1, t.y1) for t in text] return None - rows_grouped = _group_rows(text, ytol=self.ytol) - elements = [len(r) for r in rows_grouped] - row_mids = [sum([(t.y0 + t.y1) / 2 for t in r]) / len(r) - if len(r) > 0 else 0 for r in rows_grouped] - rows = [(row_mids[i] + row_mids[i - 1]) / 2 for i in range(1, len(row_mids))] - bounds = _get_table_bounds(rows_grouped) - rows.insert(0, bounds[3]) - rows.append(bounds[2]) - rows = [(rows[i], rows[i + 1]) - for i in range(0, len(rows) - 1)] - - guess = False - if self.columns: - # user has to input boundary columns too - # take (0, width) by default - # similar to else condition - # len can't be 1 - cols = self.columns.split(',') - cols = [(float(cols[i]), float(cols[i + 1])) - for i in range(0, len(cols) - 1)] - else: + if self.table_area: + if self.columns: + if len(self.table_area) != len(self.columns): + raise ValueError("message") if self.ncolumns: - ncols = self.ncolumns - cols = [(t.x0, t.x1) - for r in rows_grouped if len(r) == ncols for t in r] - cols = _merge_columns(sorted(cols), mtol=self.mtol) - if len(cols) != self.ncolumns: - logging.warning("{}: The number of columns after merge" - " isn't the same as what you specified." - " Change the value of mtol.".format( - os.path.basename(bname))) - cols = _join_columns(cols, bounds[0], bounds[1]) - else: - guess = True - ncols = max(set(elements), key=elements.count) - len_non_mode = len(filter(lambda x: x != ncols, elements)) - if ncols == 1 and not self.debug: - # no tables detected - logging.warning("{}: Only one column was detected, the PDF" - " may have no tables. Specify ncols if" - " the PDF has tables.".format( - os.path.basename(bname))) - cols = [(t.x0, t.x1) - for r in rows_grouped if len(r) == ncols for t in r] - cols = _merge_columns(sorted(cols), mtol=self.mtol) - inner_text = [] - for i in range(1, len(cols)): - left = cols[i - 1][1] - right = cols[i][0] - inner_text.extend([t for t in text if t.x0 > left and t.x1 < right]) - outer_text = [t for t in text if t.x0 > cols[-1][1] or t.x1 < cols[0][0]] - inner_text.extend(outer_text) - cols = _add_columns(cols, inner_text, self.ytol) - cols = _join_columns(cols, bounds[0], bounds[1]) - - pdf_page = {} - page_tables = {} - table_info = {} - table = Table(cols, rows) - rerror = [] - cerror = [] - for row in rows_grouped: - for t in row: - try: - r_idx, rass_error = get_row_index(t, rows) - except ValueError as e: - # couldn't assign LTTextLH to any cell - vprint(e.message) - continue - try: - c_idx, cass_error = _get_column_index(t, cols) - except ValueError as e: - # couldn't assign LTTextLH to any cell - vprint(e.message) - continue - rerror.append(rass_error) - cerror.append(cass_error) - table.cells[r_idx][c_idx].add_text( - t.get_text().strip('\n')) - if guess: - score = get_score([[33, rerror], [33, cerror], [34, [len_non_mode / len(elements)]]]) + if len(self.table_area) != len(self.ncolumns): + raise ValueError("message") + if len(self.ytol) == 1 and self.ytol[0] == 2: + self.ytol = self.ytol * len(self.table_area) + if len(self.mtol) == 1 and self.mtol[0] == 2: + self.mtol = self.mtol * len(self.table_area) + table_bbox = {} + for area in self.table_area: + x1, y1, x2, y2 = area.split(",") + x1 = int(x1) + y1 = int(y1) + x2 = int(x2) + y2 = int(y2) + table_bbox[(x1, y2, x2, y1)] = None else: - score = get_score([[50, rerror], [50, cerror]]) - table_info['score'] = score - ar = table.get_list() - ar = encode_list(ar) - table_info['data'] = ar - empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar) - table_info['empty_p'] = empty_p - table_info['r_nempty_cells'] = r_nempty_cells - table_info['c_nempty_cells'] = c_nempty_cells - table_info['nrows'] = len(ar) - table_info['ncols'] = len(ar[0]) - page_tables['table_1'] = table_info - pdf_page[os.path.basename(bname)] = page_tables + table_bbox = {(0, height, width, 0): None} - return pdf_page \ No newline at end of file + page = {} + tables = {} + table_no = 0 + # sort tables based on y-coord + for k in sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True): + # select elements which lie within table_bbox + table_data = {} + t_bbox = text_bbox(k, text) + t_bbox.sort(key=lambda x: (-x.y0, x.x0)) + + rows_grouped = _group_rows(t_bbox, ytol=self.ytol[table_no]) + rows = _join_rows(rows_grouped, k[3], k[1]) + elements = [len(r) for r in rows_grouped] + + guess = False + if self.columns and self.columns[table_no] != "": + # user has to input boundary columns too + # take (0, width) by default + # similar to else condition + # len can't be 1 + cols = self.columns[table_no].split(',') + cols = [(float(cols[i]), float(cols[i + 1])) + for i in range(0, len(cols) - 1)] + else: + if self.ncolumns and self.ncolumns[table_no] != -1: + ncols = self.ncolumns[table_no] + cols = [(t.x0, t.x1) + for r in rows_grouped if len(r) == ncols for t in r] + cols = _merge_columns(sorted(cols), mtol=self.mtol[table_no]) + if len(cols) != self.ncolumns[table_no]: + logging.warning("{}: The number of columns after merge" + " isn't the same as what you specified." + " Change the value of mtol.".format( + os.path.basename(bname))) + cols = _join_columns(cols, k[0], k[2]) + else: + guess = True + ncols = max(set(elements), key=elements.count) + len_non_mode = len(filter(lambda x: x != ncols, elements)) + if ncols == 1 and not self.debug: + # no tables detected + logging.warning("{}: Only one column was detected, the PDF" + " may have no tables. Specify ncols if" + " the PDF has tables.".format( + os.path.basename(bname))) + cols = [(t.x0, t.x1) + for r in rows_grouped if len(r) == ncols for t in r] + cols = _merge_columns(sorted(cols), mtol=self.mtol[table_no]) + inner_text = [] + for i in range(1, len(cols)): + left = cols[i - 1][1] + right = cols[i][0] + inner_text.extend([t for t in text if t.x0 > left and t.x1 < right]) + outer_text = [t for t in text if t.x0 > cols[-1][1] or t.x1 < cols[0][0]] + inner_text.extend(outer_text) + cols = _add_columns(cols, inner_text, self.ytol[table_no]) + cols = _join_columns(cols, k[0], k[2]) + + table = Table(cols, rows) + rerror = [] + cerror = [] + for row in rows_grouped: + for t in row: + try: + r_idx, rass_error = get_row_index(t, rows) + except ValueError as e: + # couldn't assign LTTextLH to any cell + continue + try: + c_idx, cass_error = _get_column_index(t, cols) + except ValueError as e: + # couldn't assign LTTextLH to any cell + continue + rerror.append(rass_error) + cerror.append(cass_error) + table.cells[r_idx][c_idx].add_text( + t.get_text().strip('\n')) + if guess: + score = get_score([[33, rerror], [33, cerror], [34, [len_non_mode / len(elements)]]]) + else: + score = get_score([[50, rerror], [50, cerror]]) + + table_data['score'] = score + ar = encode_list(table.get_list()) + table_data['data'] = ar + empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar) + table_data['empty_p'] = empty_p + table_data['r_nempty_cells'] = r_nempty_cells + table_data['c_nempty_cells'] = c_nempty_cells + table_data['nrows'] = len(ar) + table_data['ncols'] = len(ar[0]) + tables['table-{0}'.format(table_no + 1)] = table_data + table_no += 1 + page[os.path.basename(bname)] = tables + + return page \ No newline at end of file diff --git a/camelot/utils.py b/camelot/utils.py index 99b7524..584c95c 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -81,7 +81,17 @@ def rotate(x1, y1, x2, y2, angle): return xnew, ynew -def transform(tables, v_segments, h_segments, factors): +def scale_to_image(k, factors): + x1, y1, x2, y2 = k + scaling_factor_x, scaling_factor_y, pdf_y = factors + x1 = scale(x1, scaling_factor_x) + y1 = scale(abs(translate(-pdf_y, y1)), scaling_factor_y) + x2 = scale(x2, scaling_factor_x) + y2 = scale(abs(translate(-pdf_y, y2)), scaling_factor_y) + return int(x1), int(y1), int(x2), int(y2) + + +def scale_to_pdf(tables, v_segments, h_segments, factors): """Translates and scales OpenCV coordinates to PDFMiner coordinate space. diff --git a/tools/camelot b/tools/camelot index 4d01ee1..ee43b65 100755 --- a/tools/camelot +++ b/tools/camelot @@ -40,9 +40,9 @@ options: -W, --wmargin Word margin. Insert blank spaces between chars if distance between words is greater than word margin. [default: 0.1] - -S, --save-info Save parsing info for each page to a file. + -S, --print-stats List stats on the parsing process. + -T, --save-stats Save stats to a file. -X, --plot Plot distributions. (page,all,rc) - -Z, --summary Summarize metrics. camelot methods: lattice Looks for lines between data. @@ -55,19 +55,21 @@ lattice_doc = """ Lattice method looks for lines between text to form a table. usage: - camelot lattice [options] [--] + camelot lattice [-t ...] [-F ...] [-j ...] + [-m ...] [options] [--] options: + -t, --tarea Specific table areas to analyze. -F, --fill Fill data in horizontal and/or vertical spanning cells. Example: -F h, -F v, -F hv - -s, --scale Scaling factor. Large scaling factor leads to - smaller lines being detected. [default: 15] - -i, --invert Invert pdf image to make sure that lines are - in foreground. -j, --jtol Tolerance to account for when comparing joint and line coordinates. [default: 2] -m, --mtol Tolerance to account for when merging lines which are very close. [default: 2] + -s, --scale Scaling factor. Large scaling factor leads to + smaller lines being detected. [default: 15] + -i, --invert Invert pdf image to make sure that lines are + in foreground. -d, --debug Debug by visualizing pdf geometry. (contour,line,joint,table) Example: -d table """ @@ -76,12 +78,14 @@ stream_doc = """ Stream method looks for whitespaces between text to form a table. usage: - camelot stream [options] [--] + camelot stream [-t ...] [-c ...] [-n ...] [-y ...] + [-m ...] [options] [--] options: - -n, --ncols Number of columns. [default: 0] + -t, --tarea Specific table areas to analyze. -c, --columns Comma-separated list of column x-coordinates. Example: -c 10.1,20.2,30.3 + -n, --ncols Number of columns. [default: -1] -y, --ytol Tolerance to account for when grouping rows together. [default: 2] -m, --mtol Tolerance to account for when merging columns @@ -166,7 +170,7 @@ def plot_rc_piechart(data, output): plt.savefig(''.join([output, '_rc.png']), dpi=300) -def summary(data, p_time): +def print_stats(data, p_time): from operator import itemgetter from itertools import groupby @@ -331,17 +335,18 @@ if __name__ == '__main__': else: p.append({'start': int(r), 'end': int(r)}) - margin_tuple = (float(args['--cmargin']), float(args['--lmargin']), + margins = (float(args['--cmargin']), float(args['--lmargin']), float(args['--wmargin'])) if args[''] == 'lattice': try: manager = Pdf(Lattice( + table_area=args['--tarea'], fill=args['--fill'], + jtol=[int(j) for j in args['--jtol']], + mtol=[int(m) for m in args['--mtol']], scale=int(args['--scale']), invert=args['--invert'], - jtol=int(args['--jtol']), - mtol=int(args['--mtol']), - pdf_margin=margin_tuple, + margins=margins, debug=args['--debug']), filename, pagenos=p, @@ -374,10 +379,10 @@ if __name__ == '__main__': if 'rc' in plot_type: plot_rc_piechart(data, pngname) - if args['--summary']: - summary(data, processing_time) + if args['--print-stats']: + print_stats(data, processing_time) - if args['--save-info']: + if args['--save-stats']: if args['--output']: scorename = os.path.join(args['--output'], os.path.basename(scorename)) with open(scorename, 'w') as score_file: @@ -402,11 +407,12 @@ if __name__ == '__main__': elif args[''] == 'stream': try: manager = Pdf(Stream( - ncolumns=int(args['--ncols']), + table_area=args['--tarea'], columns=args['--columns'], - ytol=int(args['--ytol']), - mtol=int(args['--mtol']), - pdf_margin=margin_tuple, + ncolumns=[int(nc) for nc in args['--ncols']], + ytol=[int(y) for y in args['--ytol']], + mtol=[int(m) for m in args['--mtol']], + margins=margins, debug=args['--debug']), filename, pagenos=p, @@ -439,10 +445,10 @@ if __name__ == '__main__': if 'rc' in plot_type: plot_rc_piechart(data, pngname) - if args['--summary']: - summary(data, processing_time) + if args['--print-stats']: + print_stats(data, processing_time) - if args['--save-info']: + if args['--save-stats']: if args['--output']: scorename = os.path.join(args['--output'], os.path.basename(scorename)) with open(scorename, 'w') as score_file: