diff --git a/camelot/handlers.py b/camelot/handlers.py index c4bcfd8..f231d96 100644 --- a/camelot/handlers.py +++ b/camelot/handlers.py @@ -91,8 +91,8 @@ class PDFHandler(object): for p in self.pages] tables = [] geometry = [] - parser = Stream(**kwargs) if not mesh else Lattice(**kwargs) for p in pages: + parser = Stream(**kwargs) if not mesh else Lattice(**kwargs) t, g = parser.extract_tables(p) tables.extend(t) geometry.append(g) diff --git a/camelot/parsers.py b/camelot/parsers.py index c9b5d2a..ac9216e 100644 --- a/camelot/parsers.py +++ b/camelot/parsers.py @@ -14,7 +14,7 @@ import pandas as pd from .core import Table, Geometry from .image_processing import (adaptive_threshold, find_lines, find_table_contours, find_table_joints) -from .utils import (scale_to_pdf, scale_to_image, segments_bbox, text_in_bbox, +from .utils import (scale_image, scale_pdf, segments_in_bbox, text_in_bbox, merge_close_values, get_table_index, compute_accuracy, count_empty, get_text_objects, get_page_layout, encode_) @@ -31,17 +31,40 @@ def _reduce_method(m): copy_reg.pickle(types.MethodType, _reduce_method) -class Stream: +class BaseParser(object): + # init objects + # no tables condition + # convert pdf to image - lattice + # image processing - lattice + # user given table area condition + # scale image components to pdf components - lattice + # compute cols and rows for each table + # create table for each table + + def _generate_layout(self, filename): + self.filename = filename + self.layout, self.dimensions = get_page_layout( + self.filename, + char_margin=self.char_margin, + line_margin=self.line_margin, + word_margin=self.word_margin) + self.horizontal_text = get_text_objects(self.layout, ltype="lh") + self.vertical_text = get_text_objects(self.layout, ltype="lv") + self.pdf_width, self.pdf_height = self.dimensions + self.basename, __ = os.path.splitext(self.filename) + self.g = Geometry() + + +class Stream(BaseParser): """ """ - def __init__(self, table_area=None, columns=None, ytol=[2], mtol=[0], + def __init__(self, table_area=None, columns=None, ytol=2, mtol=0, margins=(1.0, 0.5, 0.1), split_text=False, flag_size=True, debug=False): - - self.method = 'stream' self.table_area = table_area self.columns = columns + self._validate_columns() self.ytol = ytol self.mtol = mtol self.char_margin, self.line_margin, self.word_margin = margins @@ -49,6 +72,12 @@ class Stream: self.flag_size = flag_size self.debug = debug + def _validate_columns(self): + if self.table_area is not None and self.columns is not None: + if len(self.table_area) != len(self.columns): + raise ValueError("Length of table_area and columns" + " should be equal.") + @staticmethod def _text_bbox(t_bbox): xmin = min([t.x0 for direction in t_bbox for t in t_bbox[direction]]) @@ -136,43 +165,8 @@ class Stream: for i in range(0, len(cols) - 1)] return cols - def extract_tables(self, pdfname): - """ - - Parameters - ---------- - pdfname - - Returns - ------- - - """ - layout, dim = get_page_layout(pdfname, char_margin=self.char_margin, - line_margin=self.line_margin, word_margin=self.word_margin) - lttextlh = get_text_objects(layout, ltype="lh") - lttextlv = get_text_objects(layout, ltype="lv") - ltchar = get_text_objects(layout, ltype="char") - width, height = dim - bname, __ = os.path.splitext(pdfname) - logger.info('Processing {0}.'.format(os.path.basename(bname))) - if not lttextlh: - warnings.warn("{0}: Page contains no text.".format( - os.path.basename(bname))) - return {os.path.basename(bname): None} - - g = Geometry() - if self.debug: - text = [] - text.extend([(t.x0, t.y0, t.x1, t.y1) for t in lttextlh]) - text.extend([(t.x0, t.y0, t.x1, t.y1) for t in lttextlv]) - g.text = text - + def _generate_table_bbox(self): if self.table_area is not None: - if self.columns is not None: - if len(self.table_area) != len(self.columns): - raise ValueError("{0}: Length of table area and columns" - " should be equal.".format(os.path.basename(bname))) - table_bbox = {} for area in self.table_area: x1, y1, x2, y2 = area.split(",") @@ -182,111 +176,133 @@ class Stream: y2 = float(y2) table_bbox[(x1, y2, x2, y1)] = None else: - table_bbox = {(0, 0, width, height): None} + table_bbox = {(0, 0, self.pdf_width, self.pdf_height): None} + self.table_bbox = table_bbox - if len(self.ytol) == 1 and self.ytol[0] == 2: - ytolerance = copy.deepcopy(self.ytol) * len(table_bbox) - else: - ytolerance = copy.deepcopy(self.ytol) + def _generate_columns_and_rows(self, table_idx, tk): + # select elements which lie within table_bbox + t_bbox = {} + t_bbox['horizontal'] = text_in_bbox(tk, self.horizontal_text) + t_bbox['vertical'] = text_in_bbox(tk, self.vertical_text) + self.t_bbox = t_bbox - if len(self.mtol) == 1 and self.mtol[0] == 0: - mtolerance = copy.deepcopy(self.mtol) * len(table_bbox) + for direction in self.t_bbox: + self.t_bbox[direction].sort(key=lambda x: (-x.y0, x.x0)) + + text_x_min, text_y_min, text_x_max, text_y_max = self._text_bbox(self.t_bbox) + rows_grouped = self._group_rows(self.t_bbox['horizontal'], ytol=self.ytol) + rows = self._join_rows(rows_grouped, text_y_max, text_y_min) + elements = [len(r) for r in rows_grouped] + + if self.columns is not None and self.columns[table_idx] != "": + # user has to input boundary columns too + # take (0, pdf_width) by default + # similar to else condition + # len can't be 1 + cols = self.columns[table_idx].split(',') + cols = [float(c) for c in cols] + cols.insert(0, text_x_min) + cols.append(text_x_max) + cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)] else: - mtolerance = copy.deepcopy(self.mtol) + ncols = max(set(elements), key=elements.count) + if ncols == 1: + # no tables condition + warnings.warn("No tables found on {}".format( + os.path.basename(self.basename))) + cols = [(t.x0, t.x1) + for r in rows_grouped if len(r) == ncols for t in r] + cols = self._merge_columns(sorted(cols), mtol=self.mtol) + inner_text = [] + for i in range(1, len(cols)): + left = cols[i - 1][1] + right = cols[i][0] + inner_text.extend([t for direction in self.t_bbox + for t in self.t_bbox[direction] + if t.x0 > left and t.x1 < right]) + outer_text = [t for direction in self.t_bbox + for t in self.t_bbox[direction] + if t.x0 > cols[-1][1] or t.x1 < cols[0][0]] + inner_text.extend(outer_text) + cols = self._add_columns(cols, inner_text, self.ytol) + cols = self._join_columns(cols, text_x_min, text_x_max) + + return cols, rows + + def _generate_table(self, table_idx, cols, rows): + table = Table(cols, rows) + table = table.set_all_edges() + pos_errors = [] + for direction in self.t_bbox: + for t in self.t_bbox[direction]: + indices, error = get_table_index( + table, t, direction, split_text=self.split_text, + flag_size=self.flag_size) + if indices[:2] != (-1, -1): + pos_errors.append(error) + for r_idx, c_idx, text in indices: + table.cells[r_idx][c_idx].add_text(text) + accuracy = compute_accuracy([[100, pos_errors]]) + + data = table.data + data = encode_(data) + table.df = pd.DataFrame(data) + table.shape = table.df.shape + + whitespace, __, __ = count_empty(data) + table.accuracy = accuracy + table.whitespace = whitespace + table.order = table_idx + 1 + table.page = int(os.path.basename(self.basename).replace('page-', '')) + + return table + + def extract_tables(self, filename): + """ + + Parameters + ---------- + filename + + Returns + ------- + + """ + logger.info('Processing {}'.format(os.path.basename(filename))) + self._generate_layout(filename) + + if not self.horizontal_text: + warnings.warn("No tables found on {}".format( + os.path.basename(self.basename))) + return [], self.g + + if self.debug: + text = [] + text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text]) + text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text]) + self.g.text = text + + self._generate_table_bbox() _tables = [] # sort tables based on y-coord - for table_no, k in enumerate(sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True)): - # select elements which lie within table_bbox - t_bbox = {} - t_bbox['horizontal'] = text_in_bbox(k, lttextlh) - t_bbox['vertical'] = text_in_bbox(k, lttextlv) - for direction in t_bbox: - t_bbox[direction].sort(key=lambda x: (-x.y0, x.x0)) - text_x_min, text_y_min, text_x_max, text_y_max = self._text_bbox(t_bbox) - rows_grouped = self._group_rows(t_bbox['horizontal'], ytol=ytolerance[table_no]) - rows = self._join_rows(rows_grouped, text_y_max, text_y_min) - elements = [len(r) for r in rows_grouped] - - guess = False - if self.columns is not None and self.columns[table_no] != "": - # user has to input boundary columns too - # take (0, width) by default - # similar to else condition - # len can't be 1 - cols = self.columns[table_no].split(',') - cols = [float(c) for c in cols] - cols.insert(0, text_x_min) - cols.append(text_x_max) - cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)] - else: - guess = True - ncols = max(set(elements), key=elements.count) - len_non_mode = len(filter(lambda x: x != ncols, elements)) - if ncols == 1: - # no tables detected - warnings.warn("{0}: Page contains no tables.".format( - os.path.basename(bname))) - cols = [(t.x0, t.x1) - for r in rows_grouped if len(r) == ncols for t in r] - cols = self._merge_columns(sorted(cols), mtol=mtolerance[table_no]) - inner_text = [] - for i in range(1, len(cols)): - left = cols[i - 1][1] - right = cols[i][0] - inner_text.extend([t for direction in t_bbox - for t in t_bbox[direction] - if t.x0 > left and t.x1 < right]) - outer_text = [t for direction in t_bbox - for t in t_bbox[direction] - if t.x0 > cols[-1][1] or t.x1 < cols[0][0]] - inner_text.extend(outer_text) - cols = self._add_columns(cols, inner_text, ytolerance[table_no]) - cols = self._join_columns(cols, text_x_min, text_x_max) - - table = Table(cols, rows) - table = table.set_all_edges() - pos_errors = [] - for direction in t_bbox: - for t in t_bbox[direction]: - indices, error = get_table_index( - table, t, direction, split_text=self.split_text, - flag_size=self.flag_size) - if indices[:2] != (-1, -1): - pos_errors.append(error) - for r_idx, c_idx, text in indices: - table.cells[r_idx][c_idx].add_text(text) - if guess: - accuracy = compute_accuracy([[66, pos_errors], [34, [len_non_mode / len(elements)]]]) - else: - accuracy = compute_accuracy([[100, pos_errors]]) - - data = table.data - data = encode_(data) - table.df = pd.DataFrame(data) - table.shape = table.df.shape - - whitespace, __, __ = count_empty(data) - table.accuracy = accuracy - table.whitespace = whitespace - table.order = table_no + 1 - table.page = int(os.path.basename(bname).replace('page-', '')) - + for table_idx, tk in enumerate(sorted(self.table_bbox.keys(), + key=lambda x: x[1], reverse=True)): + cols, rows = self._generate_columns_and_rows(table_idx, tk) + table = self._generate_table(table_idx, cols, rows) _tables.append(table) - return _tables, g + return _tables, self.g -class Lattice: +class Lattice(BaseParser): """ """ - def __init__(self, table_area=None, fill=None, mtol=[2], jtol=[2], + def __init__(self, table_area=None, fill=None, mtol=2, jtol=2, blocksize=15, threshold_constant=-2, scale=15, iterations=0, invert=False, margins=(1.0, 0.5, 0.1), split_text=False, flag_size=True, shift_text=['l', 't'], debug=None): - - self.method = 'lattice' self.table_area = table_area self.fill = fill self.mtol = mtol @@ -326,7 +342,7 @@ class Lattice: indices.append((r_idx, c_idx, text)) return indices - + @staticmethod def _fill_spanning(t, fill=None): for f in fill: if f == "h": @@ -343,33 +359,10 @@ class Lattice: t.cells[i][j].add_text(t.cells[i - 1][j].get_text()) return t - def extract_tables(self, pdfname): - """ - - Parameters - ---------- - pdfname - - Returns - ------- - - """ - layout, dim = get_page_layout(pdfname, char_margin=self.char_margin, - line_margin=self.line_margin, word_margin=self.word_margin) - lttextlh = get_text_objects(layout, ltype="lh") - lttextlv = get_text_objects(layout, ltype="lv") - ltchar = get_text_objects(layout, ltype="char") - width, height = dim - bname, __ = os.path.splitext(pdfname) - logger.info('Processing {0}.'.format(os.path.basename(bname))) - if not ltchar: - warnings.warn("{0}: Page contains no text.".format( - os.path.basename(bname))) - return {os.path.basename(bname): None} - - imagename = ''.join([bname, '.png']) + def _generate_image(self): + self.imagename = ''.join([self.basename, '.png']) gs_call = [ - "-q", "-sDEVICE=png16m", "-o", imagename, "-r600", pdfname + "-q", "-sDEVICE=png16m", "-o", self.imagename, "-r600", self.filename ] if "ghostscript" in subprocess.check_output(["gs", "-version"]).lower(): gs_call.insert(0, "gs") @@ -378,23 +371,22 @@ class Lattice: subprocess.call(gs_call, stdout=open(os.devnull, 'w'), stderr=subprocess.STDOUT) - img, threshold = adaptive_threshold(imagename, invert=self.invert, + def _generate_table_bbox(self): + self.image, self.threshold = adaptive_threshold(self.imagename, invert=self.invert, blocksize=self.blocksize, c=self.threshold_constant) - pdf_x = width - pdf_y = height - img_x = img.shape[1] - img_y = img.shape[0] - sc_x_image = img_x / float(pdf_x) - sc_y_image = img_y / float(pdf_y) - sc_x_pdf = pdf_x / float(img_x) - sc_y_pdf = pdf_y / float(img_y) - factors_image = (sc_x_image, sc_y_image, pdf_y) - factors_pdf = (sc_x_pdf, sc_y_pdf, img_y) + image_width = self.image.shape[1] + image_height = self.image.shape[0] + image_width_scaler = image_width / float(self.pdf_width) + image_height_scaler = image_height / float(self.pdf_height) + pdf_width_scaler = self.pdf_width / float(image_width) + pdf_height_scaler = self.pdf_height / float(image_height) + image_scalers = (image_width_scaler, image_height_scaler, self.pdf_height) + pdf_scalers = (pdf_width_scaler, pdf_height_scaler, image_height) - vmask, v_segments = find_lines(threshold, direction='vertical', - scale=self.scale, iterations=self.iterations) - hmask, h_segments = find_lines(threshold, direction='horizontal', - scale=self.scale, iterations=self.iterations) + vertical_mask, vertical_segments = find_lines(self.threshold, + direction='vertical', scale=self.scale, iterations=self.iterations) + horizontal_mask, horizontal_segments = find_lines(self.threshold, + direction='horizontal', scale=self.scale, iterations=self.iterations) if self.table_area is not None: areas = [] @@ -404,95 +396,118 @@ class Lattice: y1 = float(y1) x2 = float(x2) y2 = float(y2) - x1, y1, x2, y2 = scale_to_image((x1, y1, x2, y2), factors_image) + x1, y1, x2, y2 = scale_pdf((x1, y1, x2, y2), image_scalers) areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1))) - table_bbox = find_table_joints(areas, vmask, hmask) + table_bbox = find_table_joints(areas, vertical_mask, horizontal_mask) else: - contours = find_table_contours(vmask, hmask) - table_bbox = find_table_joints(contours, vmask, hmask) + contours = find_table_contours(vertical_mask, horizontal_mask) + table_bbox = find_table_joints(contours, vertical_mask, horizontal_mask) - if len(self.mtol) == 1 and self.mtol[0] == 2: - mtolerance = copy.deepcopy(self.mtol) * len(table_bbox) - else: - mtolerance = copy.deepcopy(self.mtol) + self.table_bbox_unscaled = copy.deepcopy(table_bbox) - if len(self.jtol) == 1 and self.jtol[0] == 2: - jtolerance = copy.deepcopy(self.jtol) * len(table_bbox) - else: - jtolerance = copy.deepcopy(self.jtol) + self.table_bbox, self.vertical_segments, self.horizontal_segments = scale_image( + table_bbox, vertical_segments, horizontal_segments, pdf_scalers) - g = Geometry() - if self.debug: - g.images = (img, table_bbox) + def _generate_columns_and_rows(self, table_idx, tk): + # select elements which lie within table_bbox + t_bbox = {} + v_s, h_s = segments_in_bbox( + tk, self.vertical_segments, self.horizontal_segments) + t_bbox['horizontal'] = text_in_bbox(tk, self.horizontal_text) + t_bbox['vertical'] = text_in_bbox(tk, self.vertical_text) + self.t_bbox = t_bbox - table_bbox, v_segments, h_segments = scale_to_pdf(table_bbox, v_segments, - h_segments, factors_pdf) + for direction in t_bbox: + t_bbox[direction].sort(key=lambda x: (-x.y0, x.x0)) + + cols, rows = zip(*self.table_bbox[tk]) + cols, rows = list(cols), list(rows) + cols.extend([tk[0], tk[2]]) + rows.extend([tk[1], tk[3]]) + # sort horizontal and vertical segments + cols = merge_close_values(sorted(cols), mtol=self.mtol) + rows = merge_close_values(sorted(rows, reverse=True), mtol=self.mtol) + # make grid using x and y coord of shortlisted rows and cols + cols = [(cols[i], cols[i + 1]) + for i in range(0, len(cols) - 1)] + rows = [(rows[i], rows[i + 1]) + for i in range(0, len(rows) - 1)] + + return cols, rows, v_s, h_s + + def _generate_table(self, table_idx, cols, rows, v_s, h_s): + table = Table(cols, rows) + # set table edges to True using ver+hor lines + table = table.set_edges(v_s, h_s, jtol=self.jtol) + # set spanning cells to True + table = table.set_spanning() + # set table border edges to True + table = table.set_border_edges() + + pos_errors = [] + for direction in self.t_bbox: + for t in self.t_bbox[direction]: + indices, error = get_table_index( + table, t, direction, split_text=self.split_text, + flag_size=self.flag_size) + if indices[:2] != (-1, -1): + pos_errors.append(error) + indices = Lattice._reduce_index(table, indices, shift_text=self.shift_text) + for r_idx, c_idx, text in indices: + table.cells[r_idx][c_idx].add_text(text) + accuracy = compute_accuracy([[100, pos_errors]]) + + if self.fill is not None: + table = Lattice._fill_spanning(table, fill=self.fill) + + data = table.data + data = encode_(data) + table.df = pd.DataFrame(data) + table.shape = table.df.shape + + whitespace, __, __ = count_empty(data) + table.accuracy = accuracy + table.whitespace = whitespace + table.order = table_idx + 1 + table.page = int(os.path.basename(self.basename).replace('page-', '')) + + return table + + def extract_tables(self, filename): + """ + + Parameters + ---------- + filename + + Returns + ------- + + """ + logger.info('Processing {}'.format(os.path.basename(filename))) + self._generate_layout(filename) + + if not self.horizontal_text: + warnings.warn("No tables found on {}".format( + os.path.basename(self.basename))) + return [], self.g + + self._generate_image() + self._generate_table_bbox() if self.debug: - g.segments = (v_segments, h_segments) + self.g.images = (self.image, self.table_bbox_unscaled) + self.g.segments = (self.vertical_segments, self.horizontal_segments) _tables = [] # sort tables based on y-coord - for table_no, k in enumerate(sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True)): - # select elements which lie within table_bbox - t_bbox = {} - v_s, h_s = segments_bbox(k, v_segments, h_segments) - t_bbox['horizontal'] = text_in_bbox(k, lttextlh) - t_bbox['vertical'] = text_in_bbox(k, lttextlv) - for direction in t_bbox: - t_bbox[direction].sort(key=lambda x: (-x.y0, x.x0)) - cols, rows = zip(*table_bbox[k]) - cols, rows = list(cols), list(rows) - cols.extend([k[0], k[2]]) - rows.extend([k[1], k[3]]) - # sort horizontal and vertical segments - cols = merge_close_values(sorted(cols), mtol=mtolerance[table_no]) - rows = merge_close_values( - sorted(rows, reverse=True), mtol=mtolerance[table_no]) - # make grid using x and y coord of shortlisted rows and cols - cols = [(cols[i], cols[i + 1]) - for i in range(0, len(cols) - 1)] - rows = [(rows[i], rows[i + 1]) - for i in range(0, len(rows) - 1)] - - table = Table(cols, rows) - # set table edges to True using ver+hor lines - table = table.set_edges(v_s, h_s, jtol=jtolerance[table_no]) - # set spanning cells to True - table = table.set_spanning() - # set table border edges to True - table = table.set_border_edges() - - pos_errors = [] - for direction in ['vertical', 'horizontal']: - for t in t_bbox[direction]: - indices, error = get_table_index( - table, t, direction, split_text=self.split_text, - flag_size=self.flag_size) - if indices[:2] != (-1, -1): - pos_errors.append(error) - indices = self._reduce_index(table, indices, shift_text=self.shift_text) - for r_idx, c_idx, text in indices: - table.cells[r_idx][c_idx].add_text(text) - accuracy = compute_accuracy([[100, pos_errors]]) - - if self.fill is not None: - table = self._fill_spanning(table, fill=self.fill) - - data = table.data - data = encode_(data) - table.df = pd.DataFrame(data) - table.shape = table.df.shape - - whitespace, __, __ = count_empty(data) - table.accuracy = accuracy - table.whitespace = whitespace - table.order = table_no + 1 - table.page = int(os.path.basename(bname).replace('page-', '')) - + for table_idx, tk in enumerate(sorted(self.table_bbox.keys(), + key=lambda x: x[1], reverse=True)): + cols, rows, v_s, h_s = self._generate_columns_and_rows(table_idx, tk) + table = self._generate_table(table_idx, cols, rows, v_s, h_s) _tables.append(table) if self.debug: - g.tables = _tables + self.g.tables = _tables - return _tables, g \ No newline at end of file + return _tables, self.g \ No newline at end of file diff --git a/camelot/utils.py b/camelot/utils.py index c6b1705..6ec04af 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -76,7 +76,7 @@ def rotate(x1, y1, x2, y2, angle): return xnew, ynew -def scale_to_image(k, factors): +def scale_pdf(k, factors): """ Parameters @@ -98,7 +98,7 @@ def scale_to_image(k, factors): return knew -def scale_to_pdf(tables, v_segments, h_segments, factors): +def scale_image(tables, v_segments, h_segments, factors): """ Parameters @@ -197,7 +197,7 @@ def get_rotation(lttextlh, lttextlv, ltchar): return rotation -def segments_bbox(bbox, v_segments, h_segments): +def segments_in_bbox(bbox, v_segments, h_segments): """ Parameters