Handle rotation at entry
parent
2a203a1865
commit
b01edee337
|
|
@ -8,8 +8,7 @@ import subprocess
|
|||
from .imgproc import (adaptive_threshold, find_lines, find_table_contours,
|
||||
find_table_joints)
|
||||
from .table import Table
|
||||
from .utils import (scale_to_pdf, scale_to_image, get_rotation, rotate_segments,
|
||||
rotate_textlines, rotate_table, segments_bbox, text_in_bbox,
|
||||
from .utils import (scale_to_pdf, scale_to_image, segments_bbox, text_in_bbox,
|
||||
merge_close_values, get_table_index, get_score, count_empty,
|
||||
encode_list, get_text_objects, get_page_layout)
|
||||
|
||||
|
|
@ -27,7 +26,7 @@ copy_reg.pickle(types.MethodType, _reduce_method)
|
|||
|
||||
def _reduce_index(t, idx, shift_text):
|
||||
"""Reduces index of a text object if it lies within a spanning
|
||||
cell taking in account table rotation.
|
||||
cell.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
|
@ -192,7 +191,7 @@ class Lattice:
|
|||
self.debug = debug
|
||||
|
||||
def get_tables(self, pdfname):
|
||||
"""get_tables
|
||||
"""Expects a single page pdf as input with rotation corrected.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
|
@ -284,14 +283,12 @@ class Lattice:
|
|||
for k in sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True):
|
||||
# select elements which lie within table_bbox
|
||||
table_data = {}
|
||||
t_bbox = {}
|
||||
v_s, h_s = segments_bbox(k, v_segments, h_segments)
|
||||
lh_bbox = text_in_bbox(k, lttextlh)
|
||||
lv_bbox = text_in_bbox(k, lttextlv)
|
||||
t_bbox['horizontal'] = text_in_bbox(k, lttextlh)
|
||||
t_bbox['vertical'] = text_in_bbox(k, lttextlv)
|
||||
char_bbox = text_in_bbox(k, ltchar)
|
||||
table_data['text_p'] = 100 * (1 - (len(char_bbox) / len(ltchar)))
|
||||
table_rotation = get_rotation(lh_bbox, lv_bbox, char_bbox)
|
||||
v_s, h_s = rotate_segments(v_s, h_s, table_rotation)
|
||||
t_bbox = rotate_textlines(lh_bbox, lv_bbox, table_rotation)
|
||||
for direction in t_bbox:
|
||||
t_bbox[direction].sort(key=lambda x: (-x.y0, x.x0))
|
||||
cols, rows = zip(*table_bbox[k])
|
||||
|
|
@ -317,7 +314,6 @@ class Lattice:
|
|||
while len(self.headers[table_no]) != len(cols):
|
||||
self.headers[table_no].append('')
|
||||
|
||||
rows, cols = rotate_table(rows, cols, table_rotation)
|
||||
table = Table(cols, rows)
|
||||
# set table edges to True using ver+hor lines
|
||||
table = table.set_edges(v_s, h_s)
|
||||
|
|
|
|||
|
|
@ -7,6 +7,8 @@ import multiprocessing as mp
|
|||
import cv2
|
||||
from PyPDF2 import PdfFileReader, PdfFileWriter
|
||||
|
||||
from .utils import get_page_layout, get_text_objects, get_rotation
|
||||
|
||||
|
||||
__all__ = ['Pdf']
|
||||
|
||||
|
|
@ -80,11 +82,34 @@ class Pdf:
|
|||
"""
|
||||
infile = PdfFileReader(open(self.pdfname, 'rb'), strict=False)
|
||||
for p in self.pagenos:
|
||||
sp_path = os.path.join(self.temp, 'page-{0}.pdf'.format(p))
|
||||
sp_name, sp_ext = os.path.splitext(sp_path)
|
||||
page = infile.getPage(p - 1)
|
||||
outfile = PdfFileWriter()
|
||||
outfile.addPage(page)
|
||||
with open(os.path.join(self.temp, 'page-{0}.pdf'.format(p)), 'wb') as f:
|
||||
with open(sp_path, 'wb') as f:
|
||||
outfile.write(f)
|
||||
layout, dim = get_page_layout(sp_path, char_margin=1.0,
|
||||
line_margin=0.5, word_margin=0.1)
|
||||
lttextlh = get_text_objects(layout, ltype="lh")
|
||||
lttextlv = get_text_objects(layout, ltype="lv")
|
||||
ltchar = get_text_objects(layout, ltype="char")
|
||||
rotation = get_rotation(lttextlh, lttextlv, ltchar)
|
||||
if rotation != '':
|
||||
sp_new_path = ''.join([sp_name.replace('page', 'p'), '_rotated', sp_ext])
|
||||
os.rename(sp_path, sp_new_path)
|
||||
sp_in = PdfFileReader(open(sp_new_path, 'rb'),
|
||||
strict=False)
|
||||
sp_out = PdfFileWriter()
|
||||
sp_page = sp_in.getPage(0)
|
||||
if rotation == 'left':
|
||||
sp_page.rotateClockwise(90)
|
||||
elif rotation == 'right':
|
||||
sp_page.rotateCounterClockwise(90)
|
||||
sp_out.addPage(sp_page)
|
||||
with open(sp_path, 'wb') as pdf_out:
|
||||
sp_out.write(pdf_out)
|
||||
|
||||
|
||||
def extract(self):
|
||||
"""Runs table extraction by calling extractor.get_tables
|
||||
|
|
|
|||
|
|
@ -7,9 +7,8 @@ import copy_reg
|
|||
import numpy as np
|
||||
|
||||
from .table import Table
|
||||
from .utils import (rotate, get_rotation, rotate_textlines, text_in_bbox,
|
||||
get_table_index, get_score, count_empty, encode_list,
|
||||
get_text_objects, get_page_layout)
|
||||
from .utils import (text_in_bbox, get_table_index, get_score, count_empty,
|
||||
encode_list, get_text_objects, get_page_layout)
|
||||
|
||||
|
||||
__all__ = ['Stream']
|
||||
|
|
@ -287,7 +286,7 @@ class Stream:
|
|||
self.debug = debug
|
||||
|
||||
def get_tables(self, pdfname):
|
||||
"""get_tables
|
||||
"""Expects a single page pdf as input with rotation corrected.
|
||||
|
||||
Parameters
|
||||
---------
|
||||
|
|
@ -349,12 +348,11 @@ class Stream:
|
|||
for k in sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True):
|
||||
# select elements which lie within table_bbox
|
||||
table_data = {}
|
||||
lh_bbox = text_in_bbox(k, lttextlh)
|
||||
lv_bbox = text_in_bbox(k, lttextlv)
|
||||
t_bbox = {}
|
||||
t_bbox['horizontal'] = text_in_bbox(k, lttextlh)
|
||||
t_bbox['vertical'] = text_in_bbox(k, lttextlv)
|
||||
char_bbox = text_in_bbox(k, ltchar)
|
||||
table_data['text_p'] = 100 * (1 - (len(char_bbox) / len(ltchar)))
|
||||
table_rotation = get_rotation(lh_bbox, lv_bbox, char_bbox)
|
||||
t_bbox = rotate_textlines(lh_bbox, lv_bbox, table_rotation)
|
||||
for direction in t_bbox:
|
||||
t_bbox[direction].sort(key=lambda x: (-x.y0, x.x0))
|
||||
text_x_min, text_y_min, text_x_max, text_y_max = _text_bbox(t_bbox)
|
||||
|
|
@ -370,11 +368,6 @@ class Stream:
|
|||
# len can't be 1
|
||||
cols = self.columns[table_no].split(',')
|
||||
cols = [float(c) for c in cols]
|
||||
if table_rotation != '':
|
||||
if table_rotation == 'left':
|
||||
cols = [rotate(0, 0, 0, c, -np.pi / 2)[0] for c in cols]
|
||||
elif table_rotation == 'right':
|
||||
cols = [rotate(0, 0, 0, c, np.pi / 2)[0] for c in cols]
|
||||
cols.insert(0, text_x_min)
|
||||
cols.append(text_x_max)
|
||||
cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
|
||||
|
|
|
|||
176
camelot/utils.py
176
camelot/utils.py
|
|
@ -243,182 +243,6 @@ def segments_bbox(bbox, v_segments, h_segments):
|
|||
return v_s, h_s
|
||||
|
||||
|
||||
def rotate_segments(v_s, h_s, table_rotation):
    """Turns line segments by a quarter turn when the table is rotated.

    Every segment is indexed as ``(x0, y0, x1, y1)``. Both end points are
    passed through ``rotate`` with ``(0, 0)`` as its first two arguments
    and an angle of ``-pi / 2`` for 'left' or ``pi / 2`` for 'right'.
    After the turn, the segments coming from `v_s` are returned as
    horizontal ones and the segments coming from `h_s` as vertical ones.

    Parameters
    ----------
    v_s : list
        List of vertical line segments.

    h_s : list
        List of horizontal line segments.

    table_rotation : string
        {'', 'left', 'right'}

    Returns
    -------
    vertical : list
        List of rotated vertical line segments. `v_s` itself when
        `table_rotation` is ''.

    horizontal : list
        List of rotated horizontal line segments. `h_s` itself when
        `table_rotation` is ''.
    """
    if table_rotation == '':
        # no rotation: hand the inputs back untouched
        return v_s, h_s

    if table_rotation not in ('left', 'right'):
        # any other non-empty value yields two empty lists
        return [], []

    turning_left = table_rotation == 'left'
    angle = -np.pi / 2 if turning_left else np.pi / 2

    def _spin(segment, flip):
        # rotate both end points; `flip` swaps which end point is
        # written first in the resulting tuple
        ax, ay = rotate(0, 0, segment[0], segment[1], angle)
        bx, by = rotate(0, 0, segment[2], segment[3], angle)
        if flip:
            return (bx, by, ax, ay)
        return (ax, ay, bx, by)

    # 'left' keeps end point order for former verticals and flips former
    # horizontals; 'right' does the opposite
    horizontal = [_spin(segment, not turning_left) for segment in v_s]
    vertical = [_spin(segment, turning_left) for segment in h_s]
    return vertical, horizontal
|
||||
|
||||
|
||||
def rotate_textlines(lh_bbox, lv_bbox, table_rotation):
    """Turns the bounding boxes of text lines when the table is rotated.

    For 'left' and 'right' the bbox of every text line, and of every
    LTChar found in its ``_objs``, is rewritten in place through
    ``set_bbox``. The corners are passed through ``rotate`` with an angle
    of ``-pi / 2`` for 'left' or ``pi / 2`` for 'right'. For any
    non-empty `table_rotation` the two lists also trade places in the
    returned dict.

    Parameters
    ----------
    lh_bbox : list
        List of PDFMiner LTTextLineHorizontal objects.

    lv_bbox : list
        List of PDFMiner LTTextLineVertical objects.

    table_rotation : string
        {'', 'left', 'right'}

    Returns
    -------
    t_bbox : dict
        Dict with two keys 'horizontal' and 'vertical'. Without rotation
        they map to `lh_bbox` and `lv_bbox` respectively; otherwise
        'horizontal' maps to `lv_bbox` and 'vertical' to `lh_bbox`.
    """
    if table_rotation == '':
        return {'horizontal': lh_bbox, 'vertical': lv_bbox}

    def _turn(item, angle, swap_x):
        # rotate both bbox corners, then store them in one of the two
        # corner orders used by this function
        x0, y0, x1, y1 = item.bbox
        x0, y0 = rotate(0, 0, x0, y0, angle)
        x1, y1 = rotate(0, 0, x1, y1, angle)
        if swap_x:
            item.set_bbox((x1, y0, x0, y1))
        else:
            item.set_bbox((x0, y1, x1, y0))

    def _turn_lines(lines, angle, swap_x):
        # each line is updated first, then the LTChar objects it holds
        for line in lines:
            _turn(line, angle, swap_x)
            for child in line._objs:
                if isinstance(child, LTChar):
                    _turn(child, angle, swap_x)

    if table_rotation == 'left':
        _turn_lines(lh_bbox, -np.pi / 2, True)
        _turn_lines(lv_bbox, -np.pi / 2, False)
    elif table_rotation == 'right':
        _turn_lines(lh_bbox, np.pi / 2, False)
        _turn_lines(lv_bbox, np.pi / 2, True)

    # after a quarter turn the former vertical lines are the horizontal
    # ones and vice versa
    return {'horizontal': lv_bbox, 'vertical': lh_bbox}
|
||||
|
||||
|
||||
def rotate_table(R, C, table_rotation):
    """Rotates coordinates of table rows and columns.

    Each entry of `R` and `C` is indexed with ``[0]`` and ``[1]``, i.e.
    it is treated as a pair of boundary coordinates. For 'left' and
    'right' the pairs are passed through ``rotate`` with an angle of
    ``-pi / 2`` or ``pi / 2`` respectively, and the roles are exchanged:
    the returned columns are built from `R` and the returned rows are
    built from `C`.

    Parameters
    ----------
    R : list
        List of row coordinate pairs.

    C : list
        List of column coordinate pairs.

    table_rotation : string
        {'', 'left', 'right'}

    Returns
    -------
    rows : list
        List of rotated row coordinate pairs. `R` itself when
        `table_rotation` is ''.

    cols : list
        List of rotated column coordinate pairs. `C` itself when
        `table_rotation` is ''.
    """
    if table_rotation == '':
        # no rotation: hand the inputs back untouched
        return R, C

    rows, cols = [], []
    if table_rotation == 'left':
        angle = -np.pi / 2
        for row in R:
            near_x, _ = rotate(0, 0, 0, row[0], angle)
            far_x, _ = rotate(0, 0, 0, row[1], angle)
            cols.append((far_x, near_x))
        # columns are returned in ascending order
        cols.sort()
        for col in C:
            _, near_y = rotate(0, 0, col[0], 0, angle)
            _, far_y = rotate(0, 0, col[1], 0, angle)
            rows.append((near_y, far_y))
    elif table_rotation == 'right':
        angle = np.pi / 2
        for row in R:
            near_x, _ = rotate(0, 0, 0, row[0], angle)
            far_x, _ = rotate(0, 0, 0, row[1], angle)
            cols.append((near_x, far_x))
        for col in C:
            _, near_y = rotate(0, 0, col[0], 0, angle)
            _, far_y = rotate(0, 0, col[1], 0, angle)
            rows.append((far_y, near_y))
        # rows are returned in descending order
        rows.sort(reverse=True)
    return rows, cols
|
||||
|
||||
|
||||
def text_in_bbox(bbox, text):
|
||||
"""Returns all text objects present inside a
|
||||
table's bounding box.
|
||||
|
|
|
|||
Loading…
Reference in New Issue