305 lines
11 KiB
Python
305 lines
11 KiB
Python
import os
|
|
import tempfile
|
|
|
|
import numpy as np
|
|
import pandas as pd
|
|
from PyPDF2 import PdfFileReader, PdfFileWriter
|
|
|
|
from .utils import get_page_layout, get_text_objects, get_rotation
|
|
|
|
|
|
class Cell(object):
|
|
def __init__(self, x1, y1, x2, y2):
|
|
|
|
self.x1 = x1
|
|
self.y1 = y1
|
|
self.x2 = x2
|
|
self.y2 = y2
|
|
self.lb = (x1, y1)
|
|
self.lt = (x1, y2)
|
|
self.rb = (x2, y1)
|
|
self.rt = (x2, y2)
|
|
self.bbox = (x1, y1, x2, y2)
|
|
self.left = False
|
|
self.right = False
|
|
self.top = False
|
|
self.bottom = False
|
|
self.text_objects = []
|
|
self.text = ''
|
|
self.spanning_h = False
|
|
self.spanning_v = False
|
|
self.image = None
|
|
|
|
def add_text(self, text):
|
|
self.text = ''.join([self.text, text])
|
|
|
|
def get_text(self):
|
|
return self.text
|
|
|
|
def add_object(self, t_object):
|
|
self.text_objects.append(t_object)
|
|
|
|
def get_objects(self):
|
|
return self.text_objects
|
|
|
|
def get_bounded_edges(self):
|
|
self.bounded_edges = self.top + self.bottom + self.left + self.right
|
|
return self.bounded_edges
|
|
|
|
|
|
class Table(object):
|
|
def __init__(self, cols, rows):
|
|
|
|
self.cols = cols
|
|
self.rows = rows
|
|
self.cells = [[Cell(c[0], r[1], c[1], r[0])
|
|
for c in cols] for r in rows]
|
|
self.nocont_ = 0
|
|
self.image = None
|
|
|
|
def set_all_edges(self):
|
|
for r in range(len(self.rows)):
|
|
for c in range(len(self.cols)):
|
|
self.cells[r][c].left = True
|
|
self.cells[r][c].right = True
|
|
self.cells[r][c].top = True
|
|
self.cells[r][c].bottom = True
|
|
return self
|
|
|
|
def set_border_edges(self):
|
|
for r in range(len(self.rows)):
|
|
self.cells[r][0].left = True
|
|
self.cells[r][len(self.cols) - 1].right = True
|
|
for c in range(len(self.cols)):
|
|
self.cells[0][c].top = True
|
|
self.cells[len(self.rows) - 1][c].bottom = True
|
|
return self
|
|
|
|
def set_edges(self, vertical, horizontal, jtol=2):
|
|
for v in vertical:
|
|
# find closest x coord
|
|
# iterate over y coords and find closest points
|
|
i = [i for i, t in enumerate(self.cols)
|
|
if np.isclose(v[0], t[0], atol=jtol)]
|
|
j = [j for j, t in enumerate(self.rows)
|
|
if np.isclose(v[3], t[0], atol=jtol)]
|
|
k = [k for k, t in enumerate(self.rows)
|
|
if np.isclose(v[1], t[0], atol=jtol)]
|
|
if not j:
|
|
self.nocont_ += 1
|
|
continue
|
|
J = j[0]
|
|
if i == [0]: # only left edge
|
|
I = i[0]
|
|
if k:
|
|
K = k[0]
|
|
while J < K:
|
|
self.cells[J][I].left = True
|
|
J += 1
|
|
else:
|
|
K = len(self.rows)
|
|
while J < K:
|
|
self.cells[J][I].left = True
|
|
J += 1
|
|
elif i == []: # only right edge
|
|
I = len(self.cols) - 1
|
|
if k:
|
|
K = k[0]
|
|
while J < K:
|
|
self.cells[J][I].right = True
|
|
J += 1
|
|
else:
|
|
K = len(self.rows)
|
|
while J < K:
|
|
self.cells[J][I].right = True
|
|
J += 1
|
|
else: # both left and right edges
|
|
I = i[0]
|
|
if k:
|
|
K = k[0]
|
|
while J < K:
|
|
self.cells[J][I].left = True
|
|
self.cells[J][I - 1].right = True
|
|
J += 1
|
|
else:
|
|
K = len(self.rows)
|
|
while J < K:
|
|
self.cells[J][I].left = True
|
|
self.cells[J][I - 1].right = True
|
|
J += 1
|
|
|
|
for h in horizontal:
|
|
# find closest y coord
|
|
# iterate over x coords and find closest points
|
|
i = [i for i, t in enumerate(self.rows)
|
|
if np.isclose(h[1], t[0], atol=jtol)]
|
|
j = [j for j, t in enumerate(self.cols)
|
|
if np.isclose(h[0], t[0], atol=jtol)]
|
|
k = [k for k, t in enumerate(self.cols)
|
|
if np.isclose(h[2], t[0], atol=jtol)]
|
|
if not j:
|
|
self.nocont_ += 1
|
|
continue
|
|
J = j[0]
|
|
if i == [0]: # only top edge
|
|
I = i[0]
|
|
if k:
|
|
K = k[0]
|
|
while J < K:
|
|
self.cells[I][J].top = True
|
|
J += 1
|
|
else:
|
|
K = len(self.cols)
|
|
while J < K:
|
|
self.cells[I][J].top = True
|
|
J += 1
|
|
elif i == []: # only bottom edge
|
|
I = len(self.rows) - 1
|
|
if k:
|
|
K = k[0]
|
|
while J < K:
|
|
self.cells[I][J].bottom = True
|
|
J += 1
|
|
else:
|
|
K = len(self.cols)
|
|
while J < K:
|
|
self.cells[I][J].bottom = True
|
|
J += 1
|
|
else: # both top and bottom edges
|
|
I = i[0]
|
|
if k:
|
|
K = k[0]
|
|
while J < K:
|
|
self.cells[I][J].top = True
|
|
self.cells[I - 1][J].bottom = True
|
|
J += 1
|
|
else:
|
|
K = len(self.cols)
|
|
while J < K:
|
|
self.cells[I][J].top = True
|
|
self.cells[I - 1][J].bottom = True
|
|
J += 1
|
|
|
|
return self
|
|
|
|
def set_spanning(self):
|
|
for r in range(len(self.rows)):
|
|
for c in range(len(self.cols)):
|
|
bound = self.cells[r][c].get_bounded_edges()
|
|
if bound == 4:
|
|
continue
|
|
elif bound == 3:
|
|
if not self.cells[r][c].left:
|
|
if (self.cells[r][c].right and
|
|
self.cells[r][c].top and
|
|
self.cells[r][c].bottom):
|
|
self.cells[r][c].spanning_h = True
|
|
elif not self.cells[r][c].right:
|
|
if (self.cells[r][c].left and
|
|
self.cells[r][c].top and
|
|
self.cells[r][c].bottom):
|
|
self.cells[r][c].spanning_h = True
|
|
elif not self.cells[r][c].top:
|
|
if (self.cells[r][c].left and
|
|
self.cells[r][c].right and
|
|
self.cells[r][c].bottom):
|
|
self.cells[r][c].spanning_v = True
|
|
elif not self.cells[r][c].bottom:
|
|
if (self.cells[r][c].left and
|
|
self.cells[r][c].right and
|
|
self.cells[r][c].top):
|
|
self.cells[r][c].spanning_v = True
|
|
elif bound == 2:
|
|
if self.cells[r][c].left and self.cells[r][c].right:
|
|
if (not self.cells[r][c].top and
|
|
not self.cells[r][c].bottom):
|
|
self.cells[r][c].spanning_v = True
|
|
elif self.cells[r][c].top and self.cells[r][c].bottom:
|
|
if (not self.cells[r][c].left and
|
|
not self.cells[r][c].right):
|
|
self.cells[r][c].spanning_h = True
|
|
|
|
return self
|
|
|
|
def get_list(self):
|
|
ar = []
|
|
for r in range(len(self.rows)):
|
|
ar.append([self.cells[r][c].get_text().strip()
|
|
for c in range(len(self.cols))])
|
|
return ar
|
|
|
|
|
|
class TableSet(object):
|
|
pass
|
|
|
|
|
|
class FileHandler(object):
|
|
def __init__(self, filename, pages='1'):
|
|
self.filename = filename
|
|
if not self.filename.endswith('.pdf'):
|
|
raise TypeError("File format not supported.")
|
|
self.pages = __get_pages(pages)
|
|
self.temp = tempfile.mkdtemp()
|
|
|
|
@staticmethod
|
|
def __get_pages(filename, pages):
|
|
p = {}
|
|
if pages == '1':
|
|
p.append({'start': 1, 'end': 1})
|
|
else:
|
|
infile = PdfFileReader(open(filename, 'rb'), strict=False)
|
|
if pages == 'all':
|
|
p.append({'start': 1, 'end': infile.getNumPages()})
|
|
else:
|
|
for r in pages.split(','):
|
|
if '-' in r:
|
|
a, b = r.split('-')
|
|
if b == 'end':
|
|
b = infile.getNumPages()
|
|
p.append({'start': int(a), 'end': int(b)})
|
|
else:
|
|
p.append({'start': int(r), 'end': int(r)})
|
|
return p
|
|
|
|
@staticmethod
|
|
def __save_page(filename, page, temp):
|
|
with open(filename, 'rb') as fileobj:
|
|
infile = PdfFileReader(fileobj, strict=False)
|
|
fpath = os.path.join(temp, 'page-{0}.pdf'.format(page))
|
|
fname, fext = os.path.splitext(fpath)
|
|
p = infile.getPage(page - 1)
|
|
outfile = PdfFileWriter()
|
|
outfile.addPage(p)
|
|
with open(fpath, 'wb') as f:
|
|
outfile.write(f)
|
|
layout, dim = get_page_layout(fpath)
|
|
# fix rotated pdf
|
|
lttextlh = get_text_objects(layout, ltype="lh")
|
|
lttextlv = get_text_objects(layout, ltype="lv")
|
|
ltchar = get_text_objects(layout, ltype="char")
|
|
rotation = get_rotation(lttextlh, lttextlv, ltchar)
|
|
if rotation != '':
|
|
fpath_new = ''.join([fname.replace('page', 'p'), '_rotated', fext])
|
|
os.rename(fpath, fpath_new)
|
|
infile = PdfFileReader(open(fpath_new, 'rb'), strict=False)
|
|
outfile = PdfFileWriter()
|
|
p = infile.getPage(0)
|
|
if rotation == 'left':
|
|
p.rotateClockwise(90)
|
|
elif rotation == 'right':
|
|
p.rotateCounterClockwise(90)
|
|
outfile.addPage(p)
|
|
with open(fpath, 'wb') as f:
|
|
outfile.write(f)
|
|
|
|
def parse(self):
|
|
for p in self.pages:
|
|
__save_page(self.filename, p, self.temp)
|
|
pages = [os.path.join(self.temp, 'page-{0}.pdf'.format(p))
|
|
for p in self.pagenos]
|
|
tables = {}
|
|
for p in pages:
|
|
table = self.parser.get_tables(p)
|
|
if table is not None:
|
|
tables.update(table)
|
|
return tables |