diff --git a/camelot/imgproc.py b/camelot/imgproc.py index eba296b..24abac3 100644 --- a/camelot/imgproc.py +++ b/camelot/imgproc.py @@ -1,3 +1,6 @@ +from itertools import groupby +from operator import itemgetter + import cv2 import numpy as np @@ -44,7 +47,7 @@ def adaptive_threshold(imagename, invert=False, blocksize=15, c=-2): return img, threshold -def find_lines(threshold, direction='horizontal', scale=15): +def find_lines(threshold, direction='horizontal', scale=15, iterations=2): """Finds horizontal and vertical lines by applying morphological transformations on an image. @@ -62,6 +65,10 @@ def find_lines(threshold, direction='horizontal', scale=15): for morph transform. (optional, default: 15) + iterations : int + Number of iterations for dilation. + (optional, default: 2) + Returns ------- dmask : object @@ -85,10 +92,9 @@ def find_lines(threshold, direction='horizontal', scale=15): raise ValueError("Specify direction as either 'vertical' or" " 'horizontal'") - threshold = cv2.erode(threshold, el, (-1, -1)) - threshold = cv2.dilate(threshold, el, (-1, -1)) - - dmask = threshold # findContours modifies source image + threshold = cv2.erode(threshold, el) + threshold = cv2.dilate(threshold, el) + dmask = cv2.dilate(threshold, el, iterations=iterations) try: _, contours, _ = cv2.findContours( @@ -190,4 +196,33 @@ def find_table_joints(contours, vertical, horizontal): joint_coords.append((c1, c2)) tables[(x, y + h, x + w, y)] = joint_coords - return tables \ No newline at end of file + return tables + + +def find_cuts(threshold, line_threshold=100): + """find_cuts + + Parameters + ---------- + threshold : object + numpy.ndarray representing the thresholded image. + + line_threshold : int + Maximum intensity of projections on y-axis. + (optional, default: 100) + + Returns + ------- + y_cuts : list + List of cuts on y-axis. + """ + y_proj = np.sum(threshold, axis=1) + y_proj_less = np.where(y_proj < line_threshold)[0] + ranges = [] + for k, g in groupby(enumerate(y_proj_less), lambda (i, x): i-x): + group = map(itemgetter(1), g) + ranges.append((group[0], group[-1])) + y_cuts = [] + for r in ranges: + y_cuts.append((r[0] + r[1]) / 2) + return sorted(y_cuts, reverse=True) \ No newline at end of file diff --git a/camelot/lattice.py b/camelot/lattice.py index 2e82222..3a60a73 100644 --- a/camelot/lattice.py +++ b/camelot/lattice.py @@ -12,7 +12,7 @@ from .imgproc import (adaptive_threshold, find_lines, find_table_contours, from .table import Table from .utils import (scale_to_pdf, scale_to_image, segments_bbox, text_in_bbox, merge_close_values, get_table_index, get_score, count_empty, - encode_list, get_text_objects, get_page_layout) + encode_list, get_text_objects, get_page_layout, remove_empty) __all__ = ['Lattice'] @@ -131,20 +131,20 @@ class Lattice: direction. (optional, default: None) - headers : list - List of strings where each string is a csv header for a table. - (optional, default: None) - mtol : list List of ints specifying m-tolerance parameters. (optional, default: [2]) - blocksize: int + jtol : list + List of ints specifying j-tolerance parameters. + (optional, default: [2]) + + blocksize : int Size of a pixel neighborhood that is used to calculate a threshold value for the pixel: 3, 5, 7, and so on. (optional, default: 15) - threshold_constant: float + threshold_constant : float Constant subtracted from the mean or weighted mean (see the details below). Normally, it is positive but may be zero or negative as well. @@ -155,6 +155,10 @@ class Lattice: element for image processing. (optional, default: 15) + iterations : int + Number of iterations for dilation. + (optional, default: 2) + invert : bool Whether or not to invert the image. Useful when pdfs have tables with lines in background. @@ -187,19 +191,20 @@ class Lattice: of detected contours, lines, joints and the table generated. (optional, default: None) """ - def __init__(self, table_area=None, fill=None, headers=None, mtol=[2], - blocksize=15, threshold_constant=-2, scale=15, invert=False, - margins=(1.0, 0.5, 0.1), split_text=False, flag_size=True, - shift_text=['l', 't'], debug=None): + def __init__(self, table_area=None, fill=None, mtol=[2], jtol=[2], + blocksize=15, threshold_constant=-2, scale=15, iterations=2, + invert=False, margins=(1.0, 0.5, 0.1), split_text=False, + flag_size=True, shift_text=['l', 't'], debug=None): self.method = 'lattice' self.table_area = table_area self.fill = fill - self.headers = headers self.mtol = mtol + self.jtol = jtol self.blocksize = blocksize self.threshold_constant = threshold_constant self.scale = scale + self.iterations = iterations self.invert = invert self.char_margin, self.line_margin, self.word_margin = margins self.split_text = split_text @@ -257,17 +262,14 @@ class Lattice: factors_pdf = (sc_x_pdf, sc_y_pdf, img_y) vmask, v_segments = find_lines(threshold, direction='vertical', - scale=self.scale) + scale=self.scale, iterations=self.iterations) hmask, h_segments = find_lines(threshold, direction='horizontal', - scale=self.scale) + scale=self.scale, iterations=self.iterations) if self.table_area is not None: if self.fill is not None: if len(self.table_area) != len(self.fill): - raise ValueError("Length of fill should be equal to table_area.") - if self.headers is not None: - if len(self.table_area) != len(self.headers): - raise ValueError("Length of headers should be equal to table_area.") + raise ValueError("Length of table area and fill should be equal.") areas = [] for area in self.table_area: @@ -288,6 +290,11 @@ class Lattice: else: mtolerance = copy.deepcopy(self.mtol) + if len(self.jtol) == 1 and self.jtol[0] == 2: + jtolerance = copy.deepcopy(self.jtol) * len(table_bbox) + else: + jtolerance = copy.deepcopy(self.jtol) + if self.debug: self.debug_images = (img, table_bbox) @@ -326,18 +333,9 @@ class Lattice: rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)] - if self.headers is not None and self.headers[table_no] != [""]: - self.headers[table_no] = self.headers[table_no].split(',') - if len(self.headers[table_no]) != len(cols): - logger.warning("Length of header ({0}) specified for table is not" - " equal to the number of columns ({1}) detected.".format( - len(self.headers[table_no]), len(cols))) - while len(self.headers[table_no]) != len(cols): - self.headers[table_no].append('') - table = Table(cols, rows) # set table edges to True using ver+hor lines - table = table.set_edges(v_s, h_s) + table = table.set_edges(v_s, h_s, jtol=jtolerance[table_no]) nouse = table.nocont_ / (len(v_s) + len(h_s)) table_data['line_p'] = 100 * (1 - nouse) # set spanning cells to True @@ -351,27 +349,27 @@ class Lattice: assignment_errors = [] table_data['split_text'] = [] table_data['superscript'] = [] - for direction in t_bbox: + for direction in ['vertical', 'horizontal']: for t in t_bbox[direction]: indices, error = get_table_index( table, t, direction, split_text=self.split_text, flag_size=self.flag_size) - assignment_errors.append(error) - indices = _reduce_index(table, indices, shift_text=self.shift_text,) - if len(indices) > 1: - table_data['split_text'].append(indices) - for r_idx, c_idx, text in indices: - if all(s in text for s in ['', '']): - table_data['superscript'].append((r_idx, c_idx, text)) - table.cells[r_idx][c_idx].add_text(text) + if indices[:2] != (-1, -1): + assignment_errors.append(error) + indices = _reduce_index(table, indices, shift_text=self.shift_text) + if len(indices) > 1: + table_data['split_text'].append(indices) + for r_idx, c_idx, text in indices: + if all(s in text for s in ['', '']): + table_data['superscript'].append((r_idx, c_idx, text)) + table.cells[r_idx][c_idx].add_text(text) score = get_score([[100, assignment_errors]]) table_data['score'] = score if self.fill is not None: table = _fill_spanning(table, fill=self.fill[table_no]) ar = table.get_list() - if self.headers is not None and self.headers[table_no] != ['']: - ar.insert(0, self.headers[table_no]) + ar = remove_empty(ar) ar = encode_list(ar) table_data['data'] = ar empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar) diff --git a/camelot/ocr.py b/camelot/ocr.py index 6d56cdf..4ce2791 100644 --- a/camelot/ocr.py +++ b/camelot/ocr.py @@ -7,19 +7,18 @@ from PIL import Image from .table import Table from .imgproc import (adaptive_threshold, find_lines, find_table_contours, - find_table_joints) -from .utils import merge_close_values, encode_list + find_table_joints, find_cuts) +from .utils import merge_close_values, encode_list, remove_empty -class OCR: - """Uses optical character recognition to get text out of image based pdfs. - Currently works only on pdfs with lines. +class OCRLattice: + """Lattice, but for images. Parameters ---------- table_area : list List of strings of the form x1,y1,x2,y2 where - (x1, y1) -> left-top and (x2, y2) -> right-bottom in PDFMiner's + (x1, y1) -> left-top and (x2, y2) -> right-bottom in OpenCV's coordinate space, denoting table areas to analyze. (optional, default: None) @@ -27,12 +26,12 @@ class OCR: List of ints specifying m-tolerance parameters. (optional, default: [2]) - blocksize: int + blocksize : int Size of a pixel neighborhood that is used to calculate a threshold value for the pixel: 3, 5, 7, and so on. (optional, default: 15) - threshold_constant: float + threshold_constant : float Constant subtracted from the mean or weighted mean (see the details below). Normally, it is positive but may be zero or negative as well. @@ -51,6 +50,10 @@ class OCR: element for image processing. (optional, default: 15) + iterations : int + Number of iterations for dilation. + (optional, default: 2) + debug : string {'contour', 'line', 'joint', 'table'} Set to one of the above values to generate a matplotlib plot @@ -58,9 +61,9 @@ class OCR: (optional, default: None) """ def __init__(self, table_area=None, mtol=[2], blocksize=15, threshold_constant=-2, - dpi=300, lang="eng", scale=15, debug=None): + dpi=300, lang="eng", scale=15, iterations=2, debug=None): - self.method = 'ocr' + self.method = 'ocrl' self.table_area = table_area self.mtol = mtol self.blocksize = blocksize @@ -69,11 +72,13 @@ class OCR: self.dpi = dpi self.lang = lang self.scale = scale + self.iterations = iterations self.debug = debug def get_tables(self, pdfname): if self.tool is None: return None + bname, __ = os.path.splitext(pdfname) imagename = ''.join([bname, '.png']) @@ -91,9 +96,9 @@ class OCR: img, threshold = adaptive_threshold(imagename, blocksize=self.blocksize, c=self.threshold_constant) vmask, v_segments = find_lines(threshold, direction='vertical', - scale=self.scale) + scale=self.scale, iterations=self.iterations) hmask, h_segments = find_lines(threshold, direction='horizontal', - scale=self.scale) + scale=self.scale, iterations=self.iterations) if self.table_area is not None: areas = [] @@ -154,6 +159,7 @@ class OCR: ar = table.get_list() ar.reverse() ar = encode_list(ar) + ar = remove_empty(ar) table_data['data'] = ar tables['table-{0}'.format(table_no + 1)] = table_data table_no += 1 @@ -162,4 +168,142 @@ class OCR: if self.debug: return None + return page + + +class OCRStream: + """Stream, but for images. + + Parameters + ---------- + table_area : list + List of strings of the form x1,y1,x2,y2 where + (x1, y1) -> left-top and (x2, y2) -> right-bottom in OpenCV's + coordinate space, denoting table areas to analyze. + (optional, default: None) + + columns : list + List of strings where each string is comma-separated values of + x-coordinates in OpenCV's coordinate space. + (optional, default: None) + + blocksize : int + Size of a pixel neighborhood that is used to calculate a + threshold value for the pixel: 3, 5, 7, and so on. + (optional, default: 15) + + threshold_constant : float + Constant subtracted from the mean or weighted mean + (see the details below). Normally, it is positive but may be + zero or negative as well. + (optional, default: -2) + + line_threshold : int + Maximum intensity of projections on y-axis. + (optional, default: 100) + + dpi : int + Dots per inch. + (optional, default: 300) + + lang : string + Language to be used for OCR. + (optional, default: 'eng') + """ + def __init__(self, table_area=None, columns=None, blocksize=15, + threshold_constant=-2, line_threshold=100, dpi=300, lang="eng", + debug=False): + + self.method = 'ocrs' + self.table_area = table_area + self.columns = columns + self.blocksize = blocksize + self.threshold_constant = threshold_constant + self.line_threshold = line_threshold + self.tool = pyocr.get_available_tools()[0] # fix this + self.dpi = dpi + self.lang = lang + self.debug = debug + + def get_tables(self, pdfname): + if self.tool is None: + return None + + bname, __ = os.path.splitext(pdfname) + imagename = ''.join([bname, '.png']) + + gs_call = [ + "-q", "-sDEVICE=png16m", "-o", imagename, "-r{0}".format(self.dpi), + pdfname + ] + if "ghostscript" in subprocess.check_output(["gs", "-version"]).lower(): + gs_call.insert(0, "gs") + else: + gs_call.insert(0, "gsc") + subprocess.call(gs_call, stdout=open(os.devnull, 'w'), + stderr=subprocess.STDOUT) + + img, threshold = adaptive_threshold(imagename, blocksize=self.blocksize, + c=self.threshold_constant) + height, width = threshold.shape + if self.debug: + self.debug_images = img + return None + + if self.table_area is not None: + if self.columns is not None: + if len(self.table_area) != len(self.columns): + raise ValueError("Length of table area and columns should be equal.") + + table_bbox = {} + for area in self.table_area: + x1, y1, x2, y2 = area.split(",") + x1 = int(x1) + y1 = int(y1) + x2 = int(x2) + y2 = int(y2) + table_bbox[(x1, y1, x2, y2)] = None + else: + table_bbox = {(0, 0, width, height): None} + + page = {} + tables = {} + table_no = 0 + for k in sorted(table_bbox.keys(), key=lambda x: x[1]): + if self.columns is None: + raise NotImplementedError + else: + table_data = {} + table_image = threshold[k[1]:k[3],k[0]:k[2]] + cols = self.columns[table_no].split(',') + cols = [float(c) for c in cols] + cols.insert(0, k[0]) + cols.append(k[2]) + cols = [(cols[i] - k[0], cols[i + 1] - k[0]) for i in range(0, len(cols) - 1)] + y_cuts = find_cuts(table_image, line_threshold=self.line_threshold) + rows = [(y_cuts[i], y_cuts[i + 1]) for i in range(0, len(y_cuts) - 1)] + table = Table(cols, rows) + for i in range(len(table.cells)): + for j in range(len(table.cells[i])): + x1 = int(table.cells[i][j].x1) + y1 = int(table.cells[i][j].y1) + x2 = int(table.cells[i][j].x2) + y2 = int(table.cells[i][j].y2) + table.cells[i][j].image = table_image[y1:y2,x1:x2] + cell_image = Image.fromarray(table.cells[i][j].image) + text = self.tool.image_to_string( + cell_image, + lang=self.lang, + builder=pyocr.builders.TextBuilder() + ) + table.cells[i][j].add_text(text) + ar = table.get_list() + ar.reverse() + ar = encode_list(ar) + ar = remove_empty(ar) + table_data['data'] = ar + tables['table-{0}'.format(table_no + 1)] = table_data + table_no += 1 + page[os.path.basename(bname)] = tables + return page \ No newline at end of file diff --git a/camelot/pdf.py b/camelot/pdf.py index f9ca4eb..7c6ae0f 100644 --- a/camelot/pdf.py +++ b/camelot/pdf.py @@ -141,11 +141,14 @@ class Pdf: if self.extractor.method == 'stream': self.debug = self.extractor.debug self.debug_text = [] - elif self.extractor.method in ['lattice', 'ocr']: + elif self.extractor.method in ['lattice', 'ocrl']: self.debug = self.extractor.debug self.debug_images = [] self.debug_segments = [] self.debug_tables = [] + elif self.extractor.method == 'ocrs': + self.debug = self.extractor.debug + self.debug_images = [] for p in pages: table = self.extractor.get_tables(p) if table is not None: @@ -157,6 +160,8 @@ class Pdf: self.debug_images.append(self.extractor.debug_images) self.debug_segments.append(self.extractor.debug_segments) self.debug_tables.append(self.extractor.debug_tables) + elif self.extractor.method == 'ocrs': + self.debug_images.append(self.extractor.debug_images) if self.clean: self.remove_tempdir() return tables @@ -175,7 +180,7 @@ class Pdf: import matplotlib.patches as patches if self.debug is True: - try: + if hasattr(self, 'debug_text'): for text in self.debug_text: fig = plt.figure() ax = fig.add_subplot(111, aspect='equal') @@ -193,8 +198,10 @@ class Pdf: ax.set_xlim(min(xs) - 10, max(xs) + 10) ax.set_ylim(min(ys) - 10, max(ys) + 10) plt.show() - except AttributeError: - raise ValueError("This option only be used with Stream.") + elif hasattr(self, 'debug_images'): + for img in self.debug_images: + plt.imshow(img) + plt.show() elif self.debug == 'contour': try: for img, table_bbox in self.debug_images: diff --git a/camelot/stream.py b/camelot/stream.py index 95c99d3..96dfedd 100644 --- a/camelot/stream.py +++ b/camelot/stream.py @@ -236,10 +236,6 @@ class Stream: x-coordinates in PDFMiner's coordinate space. (optional, default: None) - headers : list - List of strings where each string is a csv header for a table. - (optional, default: None) - ytol : list List of ints specifying the y-tolerance parameters. (optional, default: [2]) @@ -268,14 +264,13 @@ class Stream: LTTextLineHorizontals in order to select table_area, columns. (optional, default: False) """ - def __init__(self, table_area=None, columns=None, headers=None, - ytol=[2], mtol=[0], margins=(1.0, 0.5, 0.1), - split_text=False, flag_size=True, debug=False): + def __init__(self, table_area=None, columns=None, ytol=[2], mtol=[0], + margins=(1.0, 0.5, 0.1), split_text=False, flag_size=True, + debug=False): self.method = 'stream' self.table_area = table_area self.columns = columns - self.headers = headers self.ytol = ytol self.mtol = mtol self.char_margin, self.line_margin, self.word_margin = margins @@ -312,14 +307,12 @@ class Stream: self.debug_text = [] self.debug_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in lttextlh]) self.debug_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in lttextlv]) + return None if self.table_area is not None: if self.columns is not None: if len(self.table_area) != len(self.columns): - raise ValueError("Length of columns should be equal to table_area.") - if self.headers is not None: - if len(self.table_area) != len(self.headers): - raise ValueError("Length of headers should be equal to table_area.") + raise ValueError("Length of table area and columns should be equal.") table_bbox = {} for area in self.table_area: @@ -336,6 +329,7 @@ class Stream: ytolerance = copy.deepcopy(self.ytol) * len(table_bbox) else: ytolerance = copy.deepcopy(self.ytol) + if len(self.mtol) == 1 and self.mtol[0] == 0: mtolerance = copy.deepcopy(self.mtol) * len(table_bbox) else: @@ -374,7 +368,7 @@ class Stream: guess = True ncols = max(set(elements), key=elements.count) len_non_mode = len(filter(lambda x: x != ncols, elements)) - if ncols == 1 and not self.debug: + if ncols == 1: # no tables detected logger.warning("{}: Only one column was detected, the pdf" " may have no tables.".format( @@ -396,15 +390,6 @@ class Stream: cols = _add_columns(cols, inner_text, ytolerance[table_no]) cols = _join_columns(cols, text_x_min, text_x_max) - if self.headers is not None and self.headers[table_no] != [""]: - self.headers[table_no] = self.headers[table_no].split(',') - if len(self.headers[table_no]) != len(cols): - logger.warning("Length of header ({0}) specified for table is not" - " equal to the number of columns ({1}) detected.".format( - len(self.headers[table_no]), len(cols))) - while len(self.headers[table_no]) != len(cols): - self.headers[table_no].append('') - table = Table(cols, rows) table = table.set_all_edges() assignment_errors = [] @@ -429,8 +414,6 @@ class Stream: table_data['score'] = score ar = table.get_list() - if self.headers is not None and self.headers[table_no] != ['']: - ar.insert(0, self.headers[table_no]) ar = encode_list(ar) table_data['data'] = ar empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar) diff --git a/camelot/table.py b/camelot/table.py index 8549300..fc1a45e 100644 --- a/camelot/table.py +++ b/camelot/table.py @@ -188,38 +188,32 @@ class Table: bound = self.cells[r][c].get_bounded_edges() if bound == 4: continue - elif bound == 3: if not self.cells[r][c].left: if (self.cells[r][c].right and self.cells[r][c].top and self.cells[r][c].bottom): self.cells[r][c].spanning_h = True - elif not self.cells[r][c].right: if (self.cells[r][c].left and self.cells[r][c].top and self.cells[r][c].bottom): self.cells[r][c].spanning_h = True - elif not self.cells[r][c].top: if (self.cells[r][c].left and self.cells[r][c].right and self.cells[r][c].bottom): self.cells[r][c].spanning_v = True - elif not self.cells[r][c].bottom: if (self.cells[r][c].left and self.cells[r][c].right and self.cells[r][c].top): self.cells[r][c].spanning_v = True - elif bound == 2: if self.cells[r][c].left and self.cells[r][c].right: if (not self.cells[r][c].top and not self.cells[r][c].bottom): self.cells[r][c].spanning_v = True - elif self.cells[r][c].top and self.cells[r][c].bottom: if (not self.cells[r][c].left and not self.cells[r][c].right): diff --git a/camelot/utils.py b/camelot/utils.py index d128740..c5e958c 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -426,40 +426,43 @@ def split_textline(table, textline, direction, flag_size=True): idx = 0 cut_text = [] bbox = textline.bbox - if direction == 'horizontal' and not textline.is_empty(): - x_overlap = [i for i, x in enumerate(table.cols) if x[0] <= bbox[2] and bbox[0] <= x[1]] - r_idx = [j for j, r in enumerate(table.rows) if r[1] <= (bbox[1] + bbox[3]) / 2 <= r[0]] - r = r_idx[0] - x_cuts = [(c, table.cells[r][c].x2) for c in x_overlap if table.cells[r][c].right] - if not x_cuts: - x_cuts = [(x_overlap[0], table.cells[r][-1].x2)] - for obj in textline._objs: - row = table.rows[r] - for cut in x_cuts: - if isinstance(obj, LTChar): - if (row[1] <= (obj.y0 + obj.y1) / 2 <= row[0] and - (obj.x0 + obj.x1) / 2 <= cut[1]): + try: + if direction == 'horizontal' and not textline.is_empty(): + x_overlap = [i for i, x in enumerate(table.cols) if x[0] <= bbox[2] and bbox[0] <= x[1]] + r_idx = [j for j, r in enumerate(table.rows) if r[1] <= (bbox[1] + bbox[3]) / 2 <= r[0]] + r = r_idx[0] + x_cuts = [(c, table.cells[r][c].x2) for c in x_overlap if table.cells[r][c].right] + if not x_cuts: + x_cuts = [(x_overlap[0], table.cells[r][-1].x2)] + for obj in textline._objs: + row = table.rows[r] + for cut in x_cuts: + if isinstance(obj, LTChar): + if (row[1] <= (obj.y0 + obj.y1) / 2 <= row[0] and + (obj.x0 + obj.x1) / 2 <= cut[1]): + cut_text.append((r, cut[0], obj)) + break + elif isinstance(obj, LTAnno): cut_text.append((r, cut[0], obj)) - break - elif isinstance(obj, LTAnno): - cut_text.append((r, cut[0], obj)) - elif direction == 'vertical' and not textline.is_empty(): - y_overlap = [j for j, y in enumerate(table.rows) if y[1] <= bbox[3] and bbox[1] <= y[0]] - c_idx = [i for i, c in enumerate(table.cols) if c[0] <= (bbox[0] + bbox[2]) / 2 <= c[1]] - c = c_idx[0] - y_cuts = [(r, table.cells[r][c].y1) for r in y_overlap if table.cells[r][c].bottom] - if not y_cuts: - y_cuts = [(y_overlap[0], table.cells[-1][c].y1)] - for obj in textline._objs: - col = table.cols[c] - for cut in y_cuts: - if isinstance(obj, LTChar): - if (col[0] <= (obj.x0 + obj.x1) / 2 <= col[1] and - (obj.y0 + obj.y1) / 2 >= cut[1]): + elif direction == 'vertical' and not textline.is_empty(): + y_overlap = [j for j, y in enumerate(table.rows) if y[1] <= bbox[3] and bbox[1] <= y[0]] + c_idx = [i for i, c in enumerate(table.cols) if c[0] <= (bbox[0] + bbox[2]) / 2 <= c[1]] + c = c_idx[0] + y_cuts = [(r, table.cells[r][c].y1) for r in y_overlap if table.cells[r][c].bottom] + if not y_cuts: + y_cuts = [(y_overlap[0], table.cells[-1][c].y1)] + for obj in textline._objs: + col = table.cols[c] + for cut in y_cuts: + if isinstance(obj, LTChar): + if (col[0] <= (obj.x0 + obj.x1) / 2 <= col[1] and + (obj.y0 + obj.y1) / 2 >= cut[1]): + cut_text.append((cut[0], c, obj)) + break + elif isinstance(obj, LTAnno): cut_text.append((cut[0], c, obj)) - break - elif isinstance(obj, LTAnno): - cut_text.append((cut[0], c, obj)) + except IndexError: + return [(-1, -1, textline.get_text())] grouped_chars = [] for key, chars in groupby(cut_text, itemgetter(0, 1)): if flag_size: diff --git a/tools/camelot b/tools/camelot index ea9d396..54a6de1 100755 --- a/tools/camelot +++ b/tools/camelot @@ -18,7 +18,7 @@ from PyPDF2 import PdfFileReader from camelot.pdf import Pdf from camelot.lattice import Lattice from camelot.stream import Stream -from camelot.ocr import OCR +from camelot.ocr import OCRLattice, OCRStream from camelot import utils @@ -54,7 +54,8 @@ options: camelot methods: lattice Looks for lines between data. stream Looks for spaces between data. - ocr Looks for lines in image based pdfs. + ocrl Lattice, but for images. + ocrs Stream, but for images. See 'camelot -h' for more information on a specific method. """ @@ -63,20 +64,22 @@ lattice_doc = """ Lattice method looks for lines between text to form a table. usage: - camelot lattice [-t ...] [-F ...] [-H
...] - [-m ...] [options] [--] + camelot lattice [-t ...] [-F ...] [-m ...] + [-j ...] [options] [--] options: -t, --tarea Specific table areas to analyze. -F, --fill Fill data in horizontal and/or vertical spanning cells. Example: -F h, -F v, -F hv - -H, --header
Specify header for each table. -m, --mtol Tolerance to account for when merging lines which are very close. [default: 2] + -j, --jtol Tolerance to account for when matching line endings + with intersections. [default: 2] -b, --blocksize See adaptive threshold doc. [default: 15] - -c, --constant See adaptive threshold doc. [default: -2] + -C, --constant See adaptive threshold doc. [default: -2] -s, --scale Scaling factor. Large scaling factor leads to smaller lines being detected. [default: 15] + -I, --iterations Number of iterations for dilation. [default: 2] -i, --invert Invert pdf image to make sure that lines are in foreground. -T, --shift_text Specify where the text in a spanning cell @@ -89,41 +92,61 @@ stream_doc = """ Stream method looks for whitespaces between text to form a table. usage: - camelot stream [-t ...] [-c ...] [-H
...] - [-y ...] [-m ...] [options] [--] + camelot stream [-t ...] [-c ...] [-m ...] + [-y ...] [options] [--] options: -t, --tarea Specific table areas to analyze. -c, --columns Comma-separated list of column x-coordinates. Example: -c 10.1,20.2,30.3 - -H, --header
Specify header for each table. - -y, --ytol Tolerance to account for when grouping rows - together. [default: 2] -m, --mtol Tolerance to account for when merging columns together. [default: 0] + -y, --ytol Tolerance to account for when grouping rows + together. [default: 2] -d, --debug Debug by visualizing textboxes. """ -ocr_doc = """ -OCR method looks for lines in image based pdfs. +ocrl_doc = """ +Lattice, but for images. usage: - camelot ocr [-t ] [-m ] [options] [--] + camelot ocrl [-t ...] [-m ...] [options] [--] options: - -t, --tarea Specific table areas to analyze. - -m, --mtol Tolerance to account for when merging lines - which are very close. [default: 2] - -b, --blocksize See adaptive threshold doc. [default: 15] - -c, --constant See adaptive threshold doc. [default: -2] - -D, --dpi Dots per inch, specify image quality to be used for OCR. - [default: 300] - -l, --lang Specify language to be used for OCR. [default: eng] - -s, --scale Scaling factor. Large scaling factor leads to - smaller lines being detected. [default: 15] - -d, --debug Debug by visualizing pdf geometry. - (contour,line,joint,table) Example: -d table + -t, --tarea Specific table areas to analyze. + -m, --mtol Tolerance to account for when merging lines + which are very close. [default: 2] + -b, --blocksize See adaptive threshold doc. [default: 15] + -C, --constant See adaptive threshold doc. [default: -2] + -D, --dpi Dots per inch, specify image quality to be used for OCR. + [default: 300] + -l, --lang Specify language to be used for OCR. [default: eng] + -s, --scale Scaling factor. Large scaling factor leads to + smaller lines being detected. [default: 15] + -I, --iterations Number of iterations for dilation. [default: 2] + -d, --debug Debug by visualizing pdf geometry. + (contour,line,joint,table) Example: -d table +""" + +ocrs_doc = """ +Stream, but for images. + +usage: + camelot ocrs [-t ...] [-c ...] [options] [--] + +options: + -t, --tarea Specific table areas to analyze. + -c, --columns Comma-separated list of column x-coordinates. + Example: -c 10.1,20.2,30.3 + -b, --blocksize See adaptive threshold doc. [default: 15] + -C, --constant See adaptive threshold doc. [default: -2] + -N, --line-threshold Maximum intensity of projections on y-axis. + [default: 100] + -D, --dpi Dots per inch, specify image quality to be used for OCR. + [default: 300] + -l, --lang Specify language to be used for OCR. [default: eng] + -d, --debug Debug by visualizing image. """ @@ -351,8 +374,10 @@ if __name__ == '__main__': args.update(docopt(lattice_doc, argv=argv)) elif args[''] == 'stream': args.update(docopt(stream_doc, argv=argv)) - elif args[''] == 'ocr': - args.update(docopt(ocr_doc, argv=argv)) + elif args[''] == 'ocrl': + args.update(docopt(ocrl_doc, argv=argv)) + elif args[''] == 'ocrs': + args.update(docopt(ocrs_doc, argv=argv)) filename = args[''] filedir = os.path.dirname(args['']) @@ -392,11 +417,12 @@ if __name__ == '__main__': kwargs = { 'table_area': args['--tarea'] if args['--tarea'] else None, 'fill': args['--fill'] if args['--fill'] else None, - 'headers': args['--header'] if args['--header'] else None, 'mtol': [int(m) for m in args['--mtol']], + 'jtol': [int(j) for j in args['--jtol']], 'blocksize': int(args['--blocksize']), 'threshold_constant': float(args['--constant']), 'scale': int(args['--scale']), + 'iterations': int(args['--iterations']), 'invert': args['--invert'], 'margins': margins, 'split_text': args['--split_text'], @@ -462,7 +488,6 @@ if __name__ == '__main__': kwargs = { 'table_area': args['--tarea'] if args['--tarea'] else None, 'columns': args['--columns'] if args['--columns'] else None, - 'headers': args['--header'] if args['--header'] else None, 'ytol': [int(y) for y in args['--ytol']], 'mtol': [int(m) for m in args['--mtol']], 'margins': margins, @@ -522,7 +547,7 @@ if __name__ == '__main__': except Exception as e: logger.exception(e.message, exc_info=True) sys.exit() - elif args[''] == 'ocr': + elif args[''] == 'ocrl': try: kwargs = { 'table_area': args['--tarea'] if args['--tarea'] else None, @@ -532,9 +557,75 @@ if __name__ == '__main__': 'dpi': int(args['--dpi']), 'lang': args['--lang'], 'scale': int(args['--scale']), + 'iterations': int(args['--iterations']), 'debug': args['--debug'] } - manager = Pdf(OCR(**kwargs), filename, pagenos=p, clean=True, + manager = Pdf(OCRLattice(**kwargs), filename, pagenos=p, clean=True, + parallel=args['--parallel']) + data = manager.extract() + + processing_time = time.time() - start_time + logger.info("Finished processing in " + str(processing_time) + " seconds") + + if args['--plot']: + if args['--output']: + pngname = os.path.join(args['--output'], os.path.basename(pngname)) + plot_type = args['--plot'].split(',') + if 'page' in plot_type: + for page_number in sorted(data.keys(), key=lambda x: int(x[5:])): + page = data[page_number] + for table_number in sorted(page.keys(), key=lambda x: int(x[6:])): + table = page[table_number] + plot_table_barchart(table['r_nempty_cells'], + table['c_nempty_cells'], + table['empty_p'], + page_number, + table_number) + + if 'all' in plot_type: + plot_all_barchart(data, pngname) + + if 'rc' in plot_type: + plot_rc_piechart(data, pngname) + + if args['--print-stats']: + print_stats(data, processing_time) + + if args['--save-stats']: + if args['--output']: + scorename = os.path.join(args['--output'], os.path.basename(scorename)) + with open(scorename, 'w') as score_file: + score_file.write('table,nrows,ncols,empty_p,line_p,text_p,score\n') + for page_number in sorted(data.keys(), key=lambda x: int(x[5:])): + page = data[page_number] + for table_number in sorted(page.keys(), key=lambda x: int(x[6:])): + table = page[table_number] + score_file.write('{0},{1},{2},{3},{4},{5},{6}\n'.format( + ''.join([page_number, '_', table_number]), + table['nrows'], + table['ncols'], + table['empty_p'], + table['line_p'], + table['text_p'], + table['score'])) + if args['--debug']: + manager.debug_plot() + except Exception as e: + logger.exception(e.message, exc_info=True) + sys.exit() + elif args[''] == 'ocrs': + try: + kwargs = { + 'table_area': args['--tarea'] if args['--tarea'] else None, + 'columns': args['--columns'] if args['--columns'] else None, + 'blocksize': int(args['--blocksize']), + 'threshold_constant': float(args['--constant']), + 'line_threshold': int(args['--line-threshold']), + 'dpi': int(args['--dpi']), + 'lang': args['--lang'], + 'debug': args['--debug'] + } + manager = Pdf(OCRStream(**kwargs), filename, pagenos=p, clean=True, parallel=args['--parallel']) data = manager.extract() @@ -588,7 +679,7 @@ if __name__ == '__main__': logger.exception(e.message, exc_info=True) sys.exit() - if args['--debug']: + if args.get('--debug') is not None and args['--debug']: print("See 'camelot -h' for various parameters you can tweak.") else: output = filedir if args['--output'] is None else args['--output']