Handle rotation at entry

pull/2/head
Vinayak Mehta 2016-10-18 15:33:38 +05:30 committed by GitHub
parent 2a203a1865
commit b01edee337
4 changed files with 38 additions and 200 deletions

View File

@ -8,8 +8,7 @@ import subprocess
from .imgproc import (adaptive_threshold, find_lines, find_table_contours, from .imgproc import (adaptive_threshold, find_lines, find_table_contours,
find_table_joints) find_table_joints)
from .table import Table from .table import Table
from .utils import (scale_to_pdf, scale_to_image, get_rotation, rotate_segments, from .utils import (scale_to_pdf, scale_to_image, segments_bbox, text_in_bbox,
rotate_textlines, rotate_table, segments_bbox, text_in_bbox,
merge_close_values, get_table_index, get_score, count_empty, merge_close_values, get_table_index, get_score, count_empty,
encode_list, get_text_objects, get_page_layout) encode_list, get_text_objects, get_page_layout)
@ -27,7 +26,7 @@ copy_reg.pickle(types.MethodType, _reduce_method)
def _reduce_index(t, idx, shift_text): def _reduce_index(t, idx, shift_text):
"""Reduces index of a text object if it lies within a spanning """Reduces index of a text object if it lies within a spanning
cell taking in account table rotation. cell.
Parameters Parameters
---------- ----------
@ -192,7 +191,7 @@ class Lattice:
self.debug = debug self.debug = debug
def get_tables(self, pdfname): def get_tables(self, pdfname):
"""get_tables """Expects a single page pdf as input with rotation corrected.
Parameters Parameters
---------- ----------
@ -284,14 +283,12 @@ class Lattice:
for k in sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True): for k in sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True):
# select elements which lie within table_bbox # select elements which lie within table_bbox
table_data = {} table_data = {}
t_bbox = {}
v_s, h_s = segments_bbox(k, v_segments, h_segments) v_s, h_s = segments_bbox(k, v_segments, h_segments)
lh_bbox = text_in_bbox(k, lttextlh) t_bbox['horizontal'] = text_in_bbox(k, lttextlh)
lv_bbox = text_in_bbox(k, lttextlv) t_bbox['vertical'] = text_in_bbox(k, lttextlv)
char_bbox = text_in_bbox(k, ltchar) char_bbox = text_in_bbox(k, ltchar)
table_data['text_p'] = 100 * (1 - (len(char_bbox) / len(ltchar))) table_data['text_p'] = 100 * (1 - (len(char_bbox) / len(ltchar)))
table_rotation = get_rotation(lh_bbox, lv_bbox, char_bbox)
v_s, h_s = rotate_segments(v_s, h_s, table_rotation)
t_bbox = rotate_textlines(lh_bbox, lv_bbox, table_rotation)
for direction in t_bbox: for direction in t_bbox:
t_bbox[direction].sort(key=lambda x: (-x.y0, x.x0)) t_bbox[direction].sort(key=lambda x: (-x.y0, x.x0))
cols, rows = zip(*table_bbox[k]) cols, rows = zip(*table_bbox[k])
@ -317,7 +314,6 @@ class Lattice:
while len(self.headers[table_no]) != len(cols): while len(self.headers[table_no]) != len(cols):
self.headers[table_no].append('') self.headers[table_no].append('')
rows, cols = rotate_table(rows, cols, table_rotation)
table = Table(cols, rows) table = Table(cols, rows)
# set table edges to True using ver+hor lines # set table edges to True using ver+hor lines
table = table.set_edges(v_s, h_s) table = table.set_edges(v_s, h_s)

View File

@ -7,6 +7,8 @@ import multiprocessing as mp
import cv2 import cv2
from PyPDF2 import PdfFileReader, PdfFileWriter from PyPDF2 import PdfFileReader, PdfFileWriter
from .utils import get_page_layout, get_text_objects, get_rotation
__all__ = ['Pdf'] __all__ = ['Pdf']
@ -80,11 +82,34 @@ class Pdf:
""" """
infile = PdfFileReader(open(self.pdfname, 'rb'), strict=False) infile = PdfFileReader(open(self.pdfname, 'rb'), strict=False)
for p in self.pagenos: for p in self.pagenos:
sp_path = os.path.join(self.temp, 'page-{0}.pdf'.format(p))
sp_name, sp_ext = os.path.splitext(sp_path)
page = infile.getPage(p - 1) page = infile.getPage(p - 1)
outfile = PdfFileWriter() outfile = PdfFileWriter()
outfile.addPage(page) outfile.addPage(page)
with open(os.path.join(self.temp, 'page-{0}.pdf'.format(p)), 'wb') as f: with open(sp_path, 'wb') as f:
outfile.write(f) outfile.write(f)
layout, dim = get_page_layout(sp_path, char_margin=1.0,
line_margin=0.5, word_margin=0.1)
lttextlh = get_text_objects(layout, ltype="lh")
lttextlv = get_text_objects(layout, ltype="lv")
ltchar = get_text_objects(layout, ltype="char")
rotation = get_rotation(lttextlh, lttextlv, ltchar)
if rotation != '':
sp_new_path = ''.join([sp_name.replace('page', 'p'), '_rotated', sp_ext])
os.rename(sp_path, sp_new_path)
sp_in = PdfFileReader(open(sp_new_path, 'rb'),
strict=False)
sp_out = PdfFileWriter()
sp_page = sp_in.getPage(0)
if rotation == 'left':
sp_page.rotateClockwise(90)
elif rotation == 'right':
sp_page.rotateCounterClockwise(90)
sp_out.addPage(sp_page)
with open(sp_path, 'wb') as pdf_out:
sp_out.write(pdf_out)
def extract(self): def extract(self):
"""Runs table extraction by calling extractor.get_tables """Runs table extraction by calling extractor.get_tables

View File

@ -7,9 +7,8 @@ import copy_reg
import numpy as np import numpy as np
from .table import Table from .table import Table
from .utils import (rotate, get_rotation, rotate_textlines, text_in_bbox, from .utils import (text_in_bbox, get_table_index, get_score, count_empty,
get_table_index, get_score, count_empty, encode_list, encode_list, get_text_objects, get_page_layout)
get_text_objects, get_page_layout)
__all__ = ['Stream'] __all__ = ['Stream']
@ -287,7 +286,7 @@ class Stream:
self.debug = debug self.debug = debug
def get_tables(self, pdfname): def get_tables(self, pdfname):
"""get_tables """Expects a single page pdf as input with rotation corrected.
Parameters Parameters
--------- ---------
@ -349,12 +348,11 @@ class Stream:
for k in sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True): for k in sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True):
# select elements which lie within table_bbox # select elements which lie within table_bbox
table_data = {} table_data = {}
lh_bbox = text_in_bbox(k, lttextlh) t_bbox = {}
lv_bbox = text_in_bbox(k, lttextlv) t_bbox['horizontal'] = text_in_bbox(k, lttextlh)
t_bbox['vertical'] = text_in_bbox(k, lttextlv)
char_bbox = text_in_bbox(k, ltchar) char_bbox = text_in_bbox(k, ltchar)
table_data['text_p'] = 100 * (1 - (len(char_bbox) / len(ltchar))) table_data['text_p'] = 100 * (1 - (len(char_bbox) / len(ltchar)))
table_rotation = get_rotation(lh_bbox, lv_bbox, char_bbox)
t_bbox = rotate_textlines(lh_bbox, lv_bbox, table_rotation)
for direction in t_bbox: for direction in t_bbox:
t_bbox[direction].sort(key=lambda x: (-x.y0, x.x0)) t_bbox[direction].sort(key=lambda x: (-x.y0, x.x0))
text_x_min, text_y_min, text_x_max, text_y_max = _text_bbox(t_bbox) text_x_min, text_y_min, text_x_max, text_y_max = _text_bbox(t_bbox)
@ -370,11 +368,6 @@ class Stream:
# len can't be 1 # len can't be 1
cols = self.columns[table_no].split(',') cols = self.columns[table_no].split(',')
cols = [float(c) for c in cols] cols = [float(c) for c in cols]
if table_rotation != '':
if table_rotation == 'left':
cols = [rotate(0, 0, 0, c, -np.pi / 2)[0] for c in cols]
elif table_rotation == 'right':
cols = [rotate(0, 0, 0, c, np.pi / 2)[0] for c in cols]
cols.insert(0, text_x_min) cols.insert(0, text_x_min)
cols.append(text_x_max) cols.append(text_x_max)
cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)] cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]

View File

@ -243,182 +243,6 @@ def segments_bbox(bbox, v_segments, h_segments):
return v_s, h_s return v_s, h_s
def rotate_segments(v_s, h_s, table_rotation):
    """Rotates line segments if the table is rotated.

    Every endpoint is passed through ``rotate`` (about the origin) by
    -pi/2 for a 'left' rotation and +pi/2 for a 'right' rotation.
    Because of the quarter turn, segments supplied as vertical come
    back in the horizontal list and vice versa.

    Parameters
    ----------
    v_s : list
        List of vertical line segments, each indexable as
        (x0, y0, x1, y1).

    h_s : list
        List of horizontal line segments, each indexable as
        (x0, y0, x1, y1).

    table_rotation : string
        {'', 'left', 'right'}

    Returns
    -------
    vertical : list
        List of rotated vertical line segments. The very same `v_s`
        object when `table_rotation` is ''.

    horizontal : list
        List of rotated horizontal line segments. The very same `h_s`
        object when `table_rotation` is ''.

    Raises
    ------
    ValueError
        If `table_rotation` is not one of '', 'left' or 'right'.
        Previously such a value silently produced two empty lists.
    """
    if table_rotation == '':
        # Nothing to do: hand the inputs back untouched.
        return v_s, h_s

    # For each direction, one family of segments keeps its endpoint
    # order after rotation while the other has its endpoints swapped.
    if table_rotation == 'left':
        angle, swap_from_v, swap_from_h = -np.pi / 2, False, True
    elif table_rotation == 'right':
        angle, swap_from_v, swap_from_h = np.pi / 2, True, False
    else:
        raise ValueError(
            "table_rotation must be '', 'left' or 'right', "
            "got {0!r}".format(table_rotation))

    def _rotated(segment, swap_ends):
        # Rotate both endpoints, then emit them in the requested order.
        x0, y0 = rotate(0, 0, segment[0], segment[1], angle)
        x1, y1 = rotate(0, 0, segment[2], segment[3], angle)
        if swap_ends:
            return (x1, y1, x0, y0)
        return (x0, y0, x1, y1)

    horizontal = [_rotated(v, swap_from_v) for v in v_s]
    vertical = [_rotated(h, swap_from_h) for h in h_s]
    return vertical, horizontal
def rotate_textlines(lh_bbox, lv_bbox, table_rotation):
    """Rotates bounding boxes of LTTextLineHorizontals and
    LTTextLineVerticals if the table is rotated.

    The text lines (and every LTChar found in their ``_objs``) are
    modified in place through ``set_bbox``. When a rotation is applied,
    the two input lists also trade places in the result: the lines that
    came in as vertical are returned under 'horizontal' and vice versa.

    Parameters
    ----------
    lh_bbox : list
        List of PDFMiner LTTextLineHorizontal objects.

    lv_bbox : list
        List of PDFMiner LTTextLineVertical objects.

    table_rotation : string
        {'', 'left', 'right'}

    Returns
    -------
    t_bbox : dict
        Dict with two keys 'horizontal' and 'vertical' with lists of
        LTTextLineHorizontals and LTTextLineVerticals respectively.

    Raises
    ------
    ValueError
        If `table_rotation` is not one of '', 'left' or 'right'.
        Previously such a value swapped the two lists without rotating
        anything.
    """
    if table_rotation == '':
        return {'horizontal': lh_bbox, 'vertical': lv_bbox}

    # After rotating both bbox corners, the corners are re-ordered in
    # one of two ways: with the x values exchanged, or with the y values
    # exchanged. Which one applies depends on the rotation direction and
    # on whether the line was horizontal or vertical.
    # NOTE(review): presumably this restores x0 <= x1 and y0 <= y1 --
    # depends on the convention of `rotate`, which is defined elsewhere.
    if table_rotation == 'left':
        angle, lh_swaps_x = -np.pi / 2, True
    elif table_rotation == 'right':
        angle, lh_swaps_x = np.pi / 2, False
    else:
        raise ValueError(
            "table_rotation must be '', 'left' or 'right', "
            "got {0!r}".format(table_rotation))

    def _rotate_bbox(item, swap_x):
        # Rotate the two corners of item.bbox and store the new bbox.
        x0, y0, x1, y1 = item.bbox
        x0, y0 = rotate(0, 0, x0, y0, angle)
        x1, y1 = rotate(0, 0, x1, y1, angle)
        if swap_x:
            item.set_bbox((x1, y0, x0, y1))
        else:
            item.set_bbox((x0, y1, x1, y0))

    def _rotate_line(textline, swap_x):
        # Rotate the line itself, then each character it contains.
        _rotate_bbox(textline, swap_x)
        for obj in textline._objs:
            if isinstance(obj, LTChar):
                _rotate_bbox(obj, swap_x)

    for t in lh_bbox:
        _rotate_line(t, lh_swaps_x)
    for t in lv_bbox:
        _rotate_line(t, not lh_swaps_x)

    return {'horizontal': lv_bbox, 'vertical': lh_bbox}
def rotate_table(R, C, table_rotation):
    """Rotates coordinates of table rows and columns.

    Each row bound ``r`` is rotated as the point (0, r) and only the
    resulting x value is kept, so the rotated rows become the returned
    columns. Each column bound ``c`` is rotated as the point (c, 0) and
    only the resulting y value is kept, so the rotated columns become
    the returned rows.

    Parameters
    ----------
    R : list
        List of row extents, each indexable as (start, end).

    C : list
        List of column extents, each indexable as (start, end).

    table_rotation : string
        {'', 'left', 'right'}

    Returns
    -------
    rows : list
        List of rotated row extents. The very same `R` object when
        `table_rotation` is ''.

    cols : list
        List of rotated column extents. The very same `C` object when
        `table_rotation` is ''.

    Raises
    ------
    ValueError
        If `table_rotation` is not one of '', 'left' or 'right'.
        Previously such a value silently produced two empty lists.
    """
    if table_rotation == '':
        return R, C

    if table_rotation == 'left':
        angle = -np.pi / 2
    elif table_rotation == 'right':
        angle = np.pi / 2
    else:
        raise ValueError(
            "table_rotation must be '', 'left' or 'right', "
            "got {0!r}".format(table_rotation))

    def _x_of_rotated_row_bound(value):
        return rotate(0, 0, 0, value, angle)[0]

    def _y_of_rotated_col_bound(value):
        return rotate(0, 0, value, 0, angle)[1]

    from_rows = [(_x_of_rotated_row_bound(r[0]), _x_of_rotated_row_bound(r[1]))
                 for r in R]
    from_cols = [(_y_of_rotated_col_bound(c[0]), _y_of_rotated_col_bound(c[1]))
                 for c in C]

    if table_rotation == 'left':
        # Former rows: flip each pair and order ascending to form columns.
        cols = sorted((end, start) for start, end in from_rows)
        rows = from_cols
    else:
        cols = from_rows
        # Former columns: flip each pair and order descending to form rows.
        rows = sorted(((end, start) for start, end in from_cols),
                      reverse=True)
    return rows, cols
def text_in_bbox(bbox, text): def text_in_bbox(bbox, text):
"""Returns all text objects present inside a """Returns all text objects present inside a
table's bounding box. table's bounding box.