diff --git a/camelot/lattice.py b/camelot/lattice.py index a9cd806..da8941a 100644 --- a/camelot/lattice.py +++ b/camelot/lattice.py @@ -8,8 +8,7 @@ import subprocess from .imgproc import (adaptive_threshold, find_lines, find_table_contours, find_table_joints) from .table import Table -from .utils import (scale_to_pdf, scale_to_image, get_rotation, rotate_segments, - rotate_textlines, rotate_table, segments_bbox, text_in_bbox, +from .utils import (scale_to_pdf, scale_to_image, segments_bbox, text_in_bbox, merge_close_values, get_table_index, get_score, count_empty, encode_list, get_text_objects, get_page_layout) @@ -27,7 +26,7 @@ copy_reg.pickle(types.MethodType, _reduce_method) def _reduce_index(t, idx, shift_text): """Reduces index of a text object if it lies within a spanning - cell taking in account table rotation. + cell. Parameters ---------- @@ -192,7 +191,7 @@ class Lattice: self.debug = debug def get_tables(self, pdfname): - """get_tables + """Expects a single page pdf as input with rotation corrected. Parameters ---------- @@ -284,14 +283,12 @@ class Lattice: for k in sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True): # select elements which lie within table_bbox table_data = {} + t_bbox = {} v_s, h_s = segments_bbox(k, v_segments, h_segments) - lh_bbox = text_in_bbox(k, lttextlh) - lv_bbox = text_in_bbox(k, lttextlv) + t_bbox['horizontal'] = text_in_bbox(k, lttextlh) + t_bbox['vertical'] = text_in_bbox(k, lttextlv) char_bbox = text_in_bbox(k, ltchar) table_data['text_p'] = 100 * (1 - (len(char_bbox) / len(ltchar))) - table_rotation = get_rotation(lh_bbox, lv_bbox, char_bbox) - v_s, h_s = rotate_segments(v_s, h_s, table_rotation) - t_bbox = rotate_textlines(lh_bbox, lv_bbox, table_rotation) for direction in t_bbox: t_bbox[direction].sort(key=lambda x: (-x.y0, x.x0)) cols, rows = zip(*table_bbox[k]) @@ -317,7 +314,6 @@ class Lattice: while len(self.headers[table_no]) != len(cols): self.headers[table_no].append('') - rows, cols = rotate_table(rows, cols, table_rotation) table = Table(cols, rows) # set table edges to True using ver+hor lines table = table.set_edges(v_s, h_s) diff --git a/camelot/pdf.py b/camelot/pdf.py index e1bfdb2..5318626 100644 --- a/camelot/pdf.py +++ b/camelot/pdf.py @@ -7,6 +7,8 @@ import multiprocessing as mp import cv2 from PyPDF2 import PdfFileReader, PdfFileWriter +from .utils import get_page_layout, get_text_objects, get_rotation + __all__ = ['Pdf'] @@ -80,11 +82,34 @@ class Pdf: """ infile = PdfFileReader(open(self.pdfname, 'rb'), strict=False) for p in self.pagenos: + sp_path = os.path.join(self.temp, 'page-{0}.pdf'.format(p)) + sp_name, sp_ext = os.path.splitext(sp_path) page = infile.getPage(p - 1) outfile = PdfFileWriter() outfile.addPage(page) - with open(os.path.join(self.temp, 'page-{0}.pdf'.format(p)), 'wb') as f: + with open(sp_path, 'wb') as f: outfile.write(f) + layout, dim = get_page_layout(sp_path, char_margin=1.0, + line_margin=0.5, word_margin=0.1) + lttextlh = get_text_objects(layout, ltype="lh") + lttextlv = get_text_objects(layout, ltype="lv") + ltchar = get_text_objects(layout, ltype="char") + rotation = get_rotation(lttextlh, lttextlv, ltchar) + if rotation != '': + sp_new_path = ''.join([sp_name.replace('page', 'p'), '_rotated', sp_ext]) + os.rename(sp_path, sp_new_path) + sp_in = PdfFileReader(open(sp_new_path, 'rb'), + strict=False) + sp_out = PdfFileWriter() + sp_page = sp_in.getPage(0) + if rotation == 'left': + sp_page.rotateClockwise(90) + elif rotation == 'right': + sp_page.rotateCounterClockwise(90) + sp_out.addPage(sp_page) + with open(sp_path, 'wb') as pdf_out: + sp_out.write(pdf_out) + def extract(self): """Runs table extraction by calling extractor.get_tables diff --git a/camelot/stream.py b/camelot/stream.py index d799099..21be6ad 100644 --- a/camelot/stream.py +++ b/camelot/stream.py @@ -7,9 +7,8 @@ import copy_reg import numpy as np from .table import Table -from .utils import (rotate, get_rotation, rotate_textlines, text_in_bbox, - get_table_index, get_score, count_empty, encode_list, - get_text_objects, get_page_layout) +from .utils import (text_in_bbox, get_table_index, get_score, count_empty, + encode_list, get_text_objects, get_page_layout) __all__ = ['Stream'] @@ -287,7 +286,7 @@ class Stream: self.debug = debug def get_tables(self, pdfname): - """get_tables + """Expects a single page pdf as input with rotation corrected. Parameters --------- @@ -349,12 +348,11 @@ class Stream: for k in sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True): # select elements which lie within table_bbox table_data = {} - lh_bbox = text_in_bbox(k, lttextlh) - lv_bbox = text_in_bbox(k, lttextlv) + t_bbox = {} + t_bbox['horizontal'] = text_in_bbox(k, lttextlh) + t_bbox['vertical'] = text_in_bbox(k, lttextlv) char_bbox = text_in_bbox(k, ltchar) table_data['text_p'] = 100 * (1 - (len(char_bbox) / len(ltchar))) - table_rotation = get_rotation(lh_bbox, lv_bbox, char_bbox) - t_bbox = rotate_textlines(lh_bbox, lv_bbox, table_rotation) for direction in t_bbox: t_bbox[direction].sort(key=lambda x: (-x.y0, x.x0)) text_x_min, text_y_min, text_x_max, text_y_max = _text_bbox(t_bbox) @@ -370,11 +368,6 @@ class Stream: # len can't be 1 cols = self.columns[table_no].split(',') cols = [float(c) for c in cols] - if table_rotation != '': - if table_rotation == 'left': - cols = [rotate(0, 0, 0, c, -np.pi / 2)[0] for c in cols] - elif table_rotation == 'right': - cols = [rotate(0, 0, 0, c, np.pi / 2)[0] for c in cols] cols.insert(0, text_x_min) cols.append(text_x_max) cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)] diff --git a/camelot/utils.py b/camelot/utils.py index ee28ee6..1f8e5e6 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -243,182 +243,6 @@ def segments_bbox(bbox, v_segments, h_segments): return v_s, h_s -def rotate_segments(v_s, h_s, table_rotation): - """Rotates line segments if the table is rotated. - - Parameters - ---------- - v : list - List of vertical line segments. - - h : list - List of horizontal line segments. - - table_rotation : string - {'', 'left', 'right'} - - - Returns - ------- - vertical : list - List of rotated vertical line segments. - - horizontal : list - List of rotated horizontal line segments. - """ - vertical, horizontal = [], [] - if table_rotation != '': - if table_rotation == 'left': - for v in v_s: - x0, y0 = rotate(0, 0, v[0], v[1], -np.pi / 2) - x1, y1 = rotate(0, 0, v[2], v[3], -np.pi / 2) - horizontal.append((x0, y0, x1, y1)) - for h in h_s: - x0, y0 = rotate(0, 0, h[0], h[1], -np.pi / 2) - x1, y1 = rotate(0, 0, h[2], h[3], -np.pi / 2) - vertical.append((x1, y1, x0, y0)) - elif table_rotation == 'right': - for v in v_s: - x0, y0 = rotate(0, 0, v[0], v[1], np.pi / 2) - x1, y1 = rotate(0, 0, v[2], v[3], np.pi / 2) - horizontal.append((x1, y1, x0, y0)) - for h in h_s: - x0, y0 = rotate(0, 0, h[0], h[1], np.pi / 2) - x1, y1 = rotate(0, 0, h[2], h[3], np.pi / 2) - vertical.append((x0, y0, x1, y1)) - else: - vertical = v_s - horizontal = h_s - return vertical, horizontal - - -def rotate_textlines(lh_bbox, lv_bbox, table_rotation): - """Rotates bounding boxes of LTTextLineHorizontals and - LTTextLineVerticals if the table is rotated. - - Parameters - ---------- - lh_bbox : list - List of PDFMiner LTTextLineHorizontal objects. - - lv_bbox : list - List of PDFMiner LTTextLineVertical objects. - - table_rotation : string - {'', 'left', 'right'} - - Returns - ------- - t_bbox : dict - Dict with two keys 'horizontal' and 'vertical' with lists of - LTTextLineHorizontals and LTTextLineVerticals respectively. - """ - t_bbox = {} - if table_rotation != '': - if table_rotation == 'left': - for t in lh_bbox: - x0, y0, x1, y1 = t.bbox - x0, y0 = rotate(0, 0, x0, y0, -np.pi / 2) - x1, y1 = rotate(0, 0, x1, y1, -np.pi / 2) - t.set_bbox((x1, y0, x0, y1)) - for obj in t._objs: - if isinstance(obj, LTChar): - x0, y0, x1, y1 = obj.bbox - x0, y0 = rotate(0, 0, x0, y0, -np.pi / 2) - x1, y1 = rotate(0, 0, x1, y1, -np.pi / 2) - obj.set_bbox((x1, y0, x0, y1)) - for t in lv_bbox: - x0, y0, x1, y1 = t.bbox - x0, y0 = rotate(0, 0, x0, y0, -np.pi / 2) - x1, y1 = rotate(0, 0, x1, y1, -np.pi / 2) - t.set_bbox((x0, y1, x1, y0)) - for obj in t._objs: - if isinstance(obj, LTChar): - x0, y0, x1, y1 = obj.bbox - x0, y0 = rotate(0, 0, x0, y0, -np.pi / 2) - x1, y1 = rotate(0, 0, x1, y1, -np.pi / 2) - obj.set_bbox((x0, y1, x1, y0)) - elif table_rotation == 'right': - for t in lh_bbox: - x0, y0, x1, y1 = t.bbox - x0, y0 = rotate(0, 0, x0, y0, np.pi / 2) - x1, y1 = rotate(0, 0, x1, y1, np.pi / 2) - t.set_bbox((x0, y1, x1, y0)) - for obj in t._objs: - if isinstance(obj, LTChar): - x0, y0, x1, y1 = obj.bbox - x0, y0 = rotate(0, 0, x0, y0, np.pi / 2) - x1, y1 = rotate(0, 0, x1, y1, np.pi / 2) - obj.set_bbox((x0, y1, x1, y0)) - for t in lv_bbox: - x0, y0, x1, y1 = t.bbox - x0, y0 = rotate(0, 0, x0, y0, np.pi / 2) - x1, y1 = rotate(0, 0, x1, y1, np.pi / 2) - t.set_bbox((x1, y0, x0, y1)) - for obj in t._objs: - if isinstance(obj, LTChar): - x0, y0, x1, y1 = obj.bbox - x0, y0 = rotate(0, 0, x0, y0, np.pi / 2) - x1, y1 = rotate(0, 0, x1, y1, np.pi / 2) - obj.set_bbox((x1, y0, x0, y1)) - t_bbox['horizontal'] = lv_bbox - t_bbox['vertical'] = lh_bbox - else: - t_bbox['horizontal'] = lh_bbox - t_bbox['vertical'] = lv_bbox - return t_bbox - - -def rotate_table(R, C, table_rotation): - """Rotates coordinates of table rows and columns. - - Parameters - ---------- - R : list - List of row x-coordinates. - - C : list - List of column y-coordinates. - - table_rotation : string - {'', 'left', 'right'} - - Returns - ------- - rows : list - List of rotated row x-coordinates. - - cols : list - List of rotated column y-coordinates. - """ - rows, cols = [], [] - if table_rotation != '': - if table_rotation == 'left': - for r in R: - r0, r1 = rotate(0, 0, 0, r[0], -np.pi / 2) - r2, r3 = rotate(0, 0, 0, r[1], -np.pi / 2) - cols.append((r2, r0)) - cols = sorted(cols) - for c in C: - c0, c1 = rotate(0, 0, c[0], 0, -np.pi / 2) - c2, c3 = rotate(0, 0, c[1], 0, -np.pi / 2) - rows.append((c1, c3)) - elif table_rotation == 'right': - for r in R: - r0, r1 = rotate(0, 0, 0, r[0], np.pi / 2) - r2, r3 = rotate(0, 0, 0, r[1], np.pi / 2) - cols.append((r0, r2)) - for c in C: - c0, c1 = rotate(0, 0, c[0], 0, np.pi / 2) - c2, c3 = rotate(0, 0, c[1], 0, np.pi / 2) - rows.append((c3, c1)) - rows = sorted(rows, reverse=True) - else: - rows = R - cols = C - return rows, cols - - def text_in_bbox(bbox, text): """Returns all text objects present inside a table's bounding box.