Handle rotation at entry
parent
2a203a1865
commit
b01edee337
|
|
@ -8,8 +8,7 @@ import subprocess
|
||||||
from .imgproc import (adaptive_threshold, find_lines, find_table_contours,
|
from .imgproc import (adaptive_threshold, find_lines, find_table_contours,
|
||||||
find_table_joints)
|
find_table_joints)
|
||||||
from .table import Table
|
from .table import Table
|
||||||
from .utils import (scale_to_pdf, scale_to_image, get_rotation, rotate_segments,
|
from .utils import (scale_to_pdf, scale_to_image, segments_bbox, text_in_bbox,
|
||||||
rotate_textlines, rotate_table, segments_bbox, text_in_bbox,
|
|
||||||
merge_close_values, get_table_index, get_score, count_empty,
|
merge_close_values, get_table_index, get_score, count_empty,
|
||||||
encode_list, get_text_objects, get_page_layout)
|
encode_list, get_text_objects, get_page_layout)
|
||||||
|
|
||||||
|
|
@ -27,7 +26,7 @@ copy_reg.pickle(types.MethodType, _reduce_method)
|
||||||
|
|
||||||
def _reduce_index(t, idx, shift_text):
|
def _reduce_index(t, idx, shift_text):
|
||||||
"""Reduces index of a text object if it lies within a spanning
|
"""Reduces index of a text object if it lies within a spanning
|
||||||
cell taking in account table rotation.
|
cell.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
|
|
@ -192,7 +191,7 @@ class Lattice:
|
||||||
self.debug = debug
|
self.debug = debug
|
||||||
|
|
||||||
def get_tables(self, pdfname):
|
def get_tables(self, pdfname):
|
||||||
"""get_tables
|
"""Expects a single page pdf as input with rotation corrected.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
|
|
@ -284,14 +283,12 @@ class Lattice:
|
||||||
for k in sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True):
|
for k in sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True):
|
||||||
# select elements which lie within table_bbox
|
# select elements which lie within table_bbox
|
||||||
table_data = {}
|
table_data = {}
|
||||||
|
t_bbox = {}
|
||||||
v_s, h_s = segments_bbox(k, v_segments, h_segments)
|
v_s, h_s = segments_bbox(k, v_segments, h_segments)
|
||||||
lh_bbox = text_in_bbox(k, lttextlh)
|
t_bbox['horizontal'] = text_in_bbox(k, lttextlh)
|
||||||
lv_bbox = text_in_bbox(k, lttextlv)
|
t_bbox['vertical'] = text_in_bbox(k, lttextlv)
|
||||||
char_bbox = text_in_bbox(k, ltchar)
|
char_bbox = text_in_bbox(k, ltchar)
|
||||||
table_data['text_p'] = 100 * (1 - (len(char_bbox) / len(ltchar)))
|
table_data['text_p'] = 100 * (1 - (len(char_bbox) / len(ltchar)))
|
||||||
table_rotation = get_rotation(lh_bbox, lv_bbox, char_bbox)
|
|
||||||
v_s, h_s = rotate_segments(v_s, h_s, table_rotation)
|
|
||||||
t_bbox = rotate_textlines(lh_bbox, lv_bbox, table_rotation)
|
|
||||||
for direction in t_bbox:
|
for direction in t_bbox:
|
||||||
t_bbox[direction].sort(key=lambda x: (-x.y0, x.x0))
|
t_bbox[direction].sort(key=lambda x: (-x.y0, x.x0))
|
||||||
cols, rows = zip(*table_bbox[k])
|
cols, rows = zip(*table_bbox[k])
|
||||||
|
|
@ -317,7 +314,6 @@ class Lattice:
|
||||||
while len(self.headers[table_no]) != len(cols):
|
while len(self.headers[table_no]) != len(cols):
|
||||||
self.headers[table_no].append('')
|
self.headers[table_no].append('')
|
||||||
|
|
||||||
rows, cols = rotate_table(rows, cols, table_rotation)
|
|
||||||
table = Table(cols, rows)
|
table = Table(cols, rows)
|
||||||
# set table edges to True using ver+hor lines
|
# set table edges to True using ver+hor lines
|
||||||
table = table.set_edges(v_s, h_s)
|
table = table.set_edges(v_s, h_s)
|
||||||
|
|
|
||||||
|
|
@ -7,6 +7,8 @@ import multiprocessing as mp
|
||||||
import cv2
|
import cv2
|
||||||
from PyPDF2 import PdfFileReader, PdfFileWriter
|
from PyPDF2 import PdfFileReader, PdfFileWriter
|
||||||
|
|
||||||
|
from .utils import get_page_layout, get_text_objects, get_rotation
|
||||||
|
|
||||||
|
|
||||||
__all__ = ['Pdf']
|
__all__ = ['Pdf']
|
||||||
|
|
||||||
|
|
@ -80,11 +82,34 @@ class Pdf:
|
||||||
"""
|
"""
|
||||||
infile = PdfFileReader(open(self.pdfname, 'rb'), strict=False)
|
infile = PdfFileReader(open(self.pdfname, 'rb'), strict=False)
|
||||||
for p in self.pagenos:
|
for p in self.pagenos:
|
||||||
|
sp_path = os.path.join(self.temp, 'page-{0}.pdf'.format(p))
|
||||||
|
sp_name, sp_ext = os.path.splitext(sp_path)
|
||||||
page = infile.getPage(p - 1)
|
page = infile.getPage(p - 1)
|
||||||
outfile = PdfFileWriter()
|
outfile = PdfFileWriter()
|
||||||
outfile.addPage(page)
|
outfile.addPage(page)
|
||||||
with open(os.path.join(self.temp, 'page-{0}.pdf'.format(p)), 'wb') as f:
|
with open(sp_path, 'wb') as f:
|
||||||
outfile.write(f)
|
outfile.write(f)
|
||||||
|
layout, dim = get_page_layout(sp_path, char_margin=1.0,
|
||||||
|
line_margin=0.5, word_margin=0.1)
|
||||||
|
lttextlh = get_text_objects(layout, ltype="lh")
|
||||||
|
lttextlv = get_text_objects(layout, ltype="lv")
|
||||||
|
ltchar = get_text_objects(layout, ltype="char")
|
||||||
|
rotation = get_rotation(lttextlh, lttextlv, ltchar)
|
||||||
|
if rotation != '':
|
||||||
|
sp_new_path = ''.join([sp_name.replace('page', 'p'), '_rotated', sp_ext])
|
||||||
|
os.rename(sp_path, sp_new_path)
|
||||||
|
sp_in = PdfFileReader(open(sp_new_path, 'rb'),
|
||||||
|
strict=False)
|
||||||
|
sp_out = PdfFileWriter()
|
||||||
|
sp_page = sp_in.getPage(0)
|
||||||
|
if rotation == 'left':
|
||||||
|
sp_page.rotateClockwise(90)
|
||||||
|
elif rotation == 'right':
|
||||||
|
sp_page.rotateCounterClockwise(90)
|
||||||
|
sp_out.addPage(sp_page)
|
||||||
|
with open(sp_path, 'wb') as pdf_out:
|
||||||
|
sp_out.write(pdf_out)
|
||||||
|
|
||||||
|
|
||||||
def extract(self):
|
def extract(self):
|
||||||
"""Runs table extraction by calling extractor.get_tables
|
"""Runs table extraction by calling extractor.get_tables
|
||||||
|
|
|
||||||
|
|
@ -7,9 +7,8 @@ import copy_reg
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
from .table import Table
|
from .table import Table
|
||||||
from .utils import (rotate, get_rotation, rotate_textlines, text_in_bbox,
|
from .utils import (text_in_bbox, get_table_index, get_score, count_empty,
|
||||||
get_table_index, get_score, count_empty, encode_list,
|
encode_list, get_text_objects, get_page_layout)
|
||||||
get_text_objects, get_page_layout)
|
|
||||||
|
|
||||||
|
|
||||||
__all__ = ['Stream']
|
__all__ = ['Stream']
|
||||||
|
|
@ -287,7 +286,7 @@ class Stream:
|
||||||
self.debug = debug
|
self.debug = debug
|
||||||
|
|
||||||
def get_tables(self, pdfname):
|
def get_tables(self, pdfname):
|
||||||
"""get_tables
|
"""Expects a single page pdf as input with rotation corrected.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
---------
|
---------
|
||||||
|
|
@ -349,12 +348,11 @@ class Stream:
|
||||||
for k in sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True):
|
for k in sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True):
|
||||||
# select elements which lie within table_bbox
|
# select elements which lie within table_bbox
|
||||||
table_data = {}
|
table_data = {}
|
||||||
lh_bbox = text_in_bbox(k, lttextlh)
|
t_bbox = {}
|
||||||
lv_bbox = text_in_bbox(k, lttextlv)
|
t_bbox['horizontal'] = text_in_bbox(k, lttextlh)
|
||||||
|
t_bbox['vertical'] = text_in_bbox(k, lttextlv)
|
||||||
char_bbox = text_in_bbox(k, ltchar)
|
char_bbox = text_in_bbox(k, ltchar)
|
||||||
table_data['text_p'] = 100 * (1 - (len(char_bbox) / len(ltchar)))
|
table_data['text_p'] = 100 * (1 - (len(char_bbox) / len(ltchar)))
|
||||||
table_rotation = get_rotation(lh_bbox, lv_bbox, char_bbox)
|
|
||||||
t_bbox = rotate_textlines(lh_bbox, lv_bbox, table_rotation)
|
|
||||||
for direction in t_bbox:
|
for direction in t_bbox:
|
||||||
t_bbox[direction].sort(key=lambda x: (-x.y0, x.x0))
|
t_bbox[direction].sort(key=lambda x: (-x.y0, x.x0))
|
||||||
text_x_min, text_y_min, text_x_max, text_y_max = _text_bbox(t_bbox)
|
text_x_min, text_y_min, text_x_max, text_y_max = _text_bbox(t_bbox)
|
||||||
|
|
@ -370,11 +368,6 @@ class Stream:
|
||||||
# len can't be 1
|
# len can't be 1
|
||||||
cols = self.columns[table_no].split(',')
|
cols = self.columns[table_no].split(',')
|
||||||
cols = [float(c) for c in cols]
|
cols = [float(c) for c in cols]
|
||||||
if table_rotation != '':
|
|
||||||
if table_rotation == 'left':
|
|
||||||
cols = [rotate(0, 0, 0, c, -np.pi / 2)[0] for c in cols]
|
|
||||||
elif table_rotation == 'right':
|
|
||||||
cols = [rotate(0, 0, 0, c, np.pi / 2)[0] for c in cols]
|
|
||||||
cols.insert(0, text_x_min)
|
cols.insert(0, text_x_min)
|
||||||
cols.append(text_x_max)
|
cols.append(text_x_max)
|
||||||
cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
|
cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
|
||||||
|
|
|
||||||
176
camelot/utils.py
176
camelot/utils.py
|
|
@ -243,182 +243,6 @@ def segments_bbox(bbox, v_segments, h_segments):
|
||||||
return v_s, h_s
|
return v_s, h_s
|
||||||
|
|
||||||
|
|
||||||
def rotate_segments(v_s, h_s, table_rotation):
|
|
||||||
"""Rotates line segments if the table is rotated.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
v : list
|
|
||||||
List of vertical line segments.
|
|
||||||
|
|
||||||
h : list
|
|
||||||
List of horizontal line segments.
|
|
||||||
|
|
||||||
table_rotation : string
|
|
||||||
{'', 'left', 'right'}
|
|
||||||
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
vertical : list
|
|
||||||
List of rotated vertical line segments.
|
|
||||||
|
|
||||||
horizontal : list
|
|
||||||
List of rotated horizontal line segments.
|
|
||||||
"""
|
|
||||||
vertical, horizontal = [], []
|
|
||||||
if table_rotation != '':
|
|
||||||
if table_rotation == 'left':
|
|
||||||
for v in v_s:
|
|
||||||
x0, y0 = rotate(0, 0, v[0], v[1], -np.pi / 2)
|
|
||||||
x1, y1 = rotate(0, 0, v[2], v[3], -np.pi / 2)
|
|
||||||
horizontal.append((x0, y0, x1, y1))
|
|
||||||
for h in h_s:
|
|
||||||
x0, y0 = rotate(0, 0, h[0], h[1], -np.pi / 2)
|
|
||||||
x1, y1 = rotate(0, 0, h[2], h[3], -np.pi / 2)
|
|
||||||
vertical.append((x1, y1, x0, y0))
|
|
||||||
elif table_rotation == 'right':
|
|
||||||
for v in v_s:
|
|
||||||
x0, y0 = rotate(0, 0, v[0], v[1], np.pi / 2)
|
|
||||||
x1, y1 = rotate(0, 0, v[2], v[3], np.pi / 2)
|
|
||||||
horizontal.append((x1, y1, x0, y0))
|
|
||||||
for h in h_s:
|
|
||||||
x0, y0 = rotate(0, 0, h[0], h[1], np.pi / 2)
|
|
||||||
x1, y1 = rotate(0, 0, h[2], h[3], np.pi / 2)
|
|
||||||
vertical.append((x0, y0, x1, y1))
|
|
||||||
else:
|
|
||||||
vertical = v_s
|
|
||||||
horizontal = h_s
|
|
||||||
return vertical, horizontal
|
|
||||||
|
|
||||||
|
|
||||||
def rotate_textlines(lh_bbox, lv_bbox, table_rotation):
|
|
||||||
"""Rotates bounding boxes of LTTextLineHorizontals and
|
|
||||||
LTTextLineVerticals if the table is rotated.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
lh_bbox : list
|
|
||||||
List of PDFMiner LTTextLineHorizontal objects.
|
|
||||||
|
|
||||||
lv_bbox : list
|
|
||||||
List of PDFMiner LTTextLineVertical objects.
|
|
||||||
|
|
||||||
table_rotation : string
|
|
||||||
{'', 'left', 'right'}
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
t_bbox : dict
|
|
||||||
Dict with two keys 'horizontal' and 'vertical' with lists of
|
|
||||||
LTTextLineHorizontals and LTTextLineVerticals respectively.
|
|
||||||
"""
|
|
||||||
t_bbox = {}
|
|
||||||
if table_rotation != '':
|
|
||||||
if table_rotation == 'left':
|
|
||||||
for t in lh_bbox:
|
|
||||||
x0, y0, x1, y1 = t.bbox
|
|
||||||
x0, y0 = rotate(0, 0, x0, y0, -np.pi / 2)
|
|
||||||
x1, y1 = rotate(0, 0, x1, y1, -np.pi / 2)
|
|
||||||
t.set_bbox((x1, y0, x0, y1))
|
|
||||||
for obj in t._objs:
|
|
||||||
if isinstance(obj, LTChar):
|
|
||||||
x0, y0, x1, y1 = obj.bbox
|
|
||||||
x0, y0 = rotate(0, 0, x0, y0, -np.pi / 2)
|
|
||||||
x1, y1 = rotate(0, 0, x1, y1, -np.pi / 2)
|
|
||||||
obj.set_bbox((x1, y0, x0, y1))
|
|
||||||
for t in lv_bbox:
|
|
||||||
x0, y0, x1, y1 = t.bbox
|
|
||||||
x0, y0 = rotate(0, 0, x0, y0, -np.pi / 2)
|
|
||||||
x1, y1 = rotate(0, 0, x1, y1, -np.pi / 2)
|
|
||||||
t.set_bbox((x0, y1, x1, y0))
|
|
||||||
for obj in t._objs:
|
|
||||||
if isinstance(obj, LTChar):
|
|
||||||
x0, y0, x1, y1 = obj.bbox
|
|
||||||
x0, y0 = rotate(0, 0, x0, y0, -np.pi / 2)
|
|
||||||
x1, y1 = rotate(0, 0, x1, y1, -np.pi / 2)
|
|
||||||
obj.set_bbox((x0, y1, x1, y0))
|
|
||||||
elif table_rotation == 'right':
|
|
||||||
for t in lh_bbox:
|
|
||||||
x0, y0, x1, y1 = t.bbox
|
|
||||||
x0, y0 = rotate(0, 0, x0, y0, np.pi / 2)
|
|
||||||
x1, y1 = rotate(0, 0, x1, y1, np.pi / 2)
|
|
||||||
t.set_bbox((x0, y1, x1, y0))
|
|
||||||
for obj in t._objs:
|
|
||||||
if isinstance(obj, LTChar):
|
|
||||||
x0, y0, x1, y1 = obj.bbox
|
|
||||||
x0, y0 = rotate(0, 0, x0, y0, np.pi / 2)
|
|
||||||
x1, y1 = rotate(0, 0, x1, y1, np.pi / 2)
|
|
||||||
obj.set_bbox((x0, y1, x1, y0))
|
|
||||||
for t in lv_bbox:
|
|
||||||
x0, y0, x1, y1 = t.bbox
|
|
||||||
x0, y0 = rotate(0, 0, x0, y0, np.pi / 2)
|
|
||||||
x1, y1 = rotate(0, 0, x1, y1, np.pi / 2)
|
|
||||||
t.set_bbox((x1, y0, x0, y1))
|
|
||||||
for obj in t._objs:
|
|
||||||
if isinstance(obj, LTChar):
|
|
||||||
x0, y0, x1, y1 = obj.bbox
|
|
||||||
x0, y0 = rotate(0, 0, x0, y0, np.pi / 2)
|
|
||||||
x1, y1 = rotate(0, 0, x1, y1, np.pi / 2)
|
|
||||||
obj.set_bbox((x1, y0, x0, y1))
|
|
||||||
t_bbox['horizontal'] = lv_bbox
|
|
||||||
t_bbox['vertical'] = lh_bbox
|
|
||||||
else:
|
|
||||||
t_bbox['horizontal'] = lh_bbox
|
|
||||||
t_bbox['vertical'] = lv_bbox
|
|
||||||
return t_bbox
|
|
||||||
|
|
||||||
|
|
||||||
def rotate_table(R, C, table_rotation):
|
|
||||||
"""Rotates coordinates of table rows and columns.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
R : list
|
|
||||||
List of row x-coordinates.
|
|
||||||
|
|
||||||
C : list
|
|
||||||
List of column y-coordinates.
|
|
||||||
|
|
||||||
table_rotation : string
|
|
||||||
{'', 'left', 'right'}
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
rows : list
|
|
||||||
List of rotated row x-coordinates.
|
|
||||||
|
|
||||||
cols : list
|
|
||||||
List of rotated column y-coordinates.
|
|
||||||
"""
|
|
||||||
rows, cols = [], []
|
|
||||||
if table_rotation != '':
|
|
||||||
if table_rotation == 'left':
|
|
||||||
for r in R:
|
|
||||||
r0, r1 = rotate(0, 0, 0, r[0], -np.pi / 2)
|
|
||||||
r2, r3 = rotate(0, 0, 0, r[1], -np.pi / 2)
|
|
||||||
cols.append((r2, r0))
|
|
||||||
cols = sorted(cols)
|
|
||||||
for c in C:
|
|
||||||
c0, c1 = rotate(0, 0, c[0], 0, -np.pi / 2)
|
|
||||||
c2, c3 = rotate(0, 0, c[1], 0, -np.pi / 2)
|
|
||||||
rows.append((c1, c3))
|
|
||||||
elif table_rotation == 'right':
|
|
||||||
for r in R:
|
|
||||||
r0, r1 = rotate(0, 0, 0, r[0], np.pi / 2)
|
|
||||||
r2, r3 = rotate(0, 0, 0, r[1], np.pi / 2)
|
|
||||||
cols.append((r0, r2))
|
|
||||||
for c in C:
|
|
||||||
c0, c1 = rotate(0, 0, c[0], 0, np.pi / 2)
|
|
||||||
c2, c3 = rotate(0, 0, c[1], 0, np.pi / 2)
|
|
||||||
rows.append((c3, c1))
|
|
||||||
rows = sorted(rows, reverse=True)
|
|
||||||
else:
|
|
||||||
rows = R
|
|
||||||
cols = C
|
|
||||||
return rows, cols
|
|
||||||
|
|
||||||
|
|
||||||
def text_in_bbox(bbox, text):
|
def text_in_bbox(bbox, text):
|
||||||
"""Returns all text objects present inside a
|
"""Returns all text objects present inside a
|
||||||
table's bounding box.
|
table's bounding box.
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue