251 lines
9.9 KiB
Python
251 lines
9.9 KiB
Python
import os
|
|
import shutil
|
|
import tempfile
|
|
import itertools
|
|
import multiprocessing as mp
|
|
|
|
import cv2
|
|
from PyPDF2 import PdfFileReader, PdfFileWriter
|
|
|
|
from .utils import get_page_layout, get_text_objects, get_rotation
|
|
|
|
|
|
__all__ = ['Pdf']
|
|
|
|
|
|
def _parse_page_numbers(pagenos):
|
|
"""Converts list of dicts to list of ints.
|
|
|
|
Parameters
|
|
----------
|
|
pagenos : list
|
|
List of dicts representing page ranges. A dict must have only
|
|
two keys named 'start' and 'end' having int as their value.
|
|
|
|
Returns
|
|
-------
|
|
page_numbers : list
|
|
List of int page numbers.
|
|
"""
|
|
page_numbers = []
|
|
for p in pagenos:
|
|
page_numbers.extend(range(p['start'], p['end'] + 1))
|
|
page_numbers = sorted(set(page_numbers))
|
|
return page_numbers
|
|
|
|
|
|
class Pdf:
    """Pdf manager.

    Handles all operations like temp directory creation, splitting file
    into single page pdfs, running extraction using multiple processes
    and removing the temp directory.

    Parameters
    ----------
    extractor : object
        camelot.stream.Stream or camelot.lattice.Lattice extractor
        object.

    pdfname : string
        Path to pdf file.

    pagenos : list
        List of dicts representing page ranges. A dict must have only
        two keys named 'start' and 'end' having int as their value.
        (optional, default: [{'start': 1, 'end': 1}])

    parallel : bool
        Whether or not to run using multiple processes.
        (optional, default: False)

    clean : bool
        Whether or not to remove the temp directory.
        (optional, default: False)

    Raises
    ------
    TypeError
        If `pdfname` does not end with '.pdf'.
    """

    def __init__(self, extractor, pdfname, pagenos=None,
                 parallel=False, clean=False):

        self.extractor = extractor
        self.pdfname = pdfname
        if not self.pdfname.endswith('.pdf'):
            raise TypeError("Only PDF format is supported right now.")
        # Build the default per call so that no mutable object is shared
        # between instances through the signature.
        if pagenos is None:
            pagenos = [{'start': 1, 'end': 1}]
        self.pagenos = _parse_page_numbers(pagenos)
        self.parallel = parallel
        self.cpu_count = mp.cpu_count()
        # Worker processes are started on demand by extract() and shut
        # down right after use, so a sequential run never spawns (and
        # never leaks) a pool. A caller may assign its own pool here; it
        # is then used as-is and left open.
        self.pool = None
        self.clean = clean
        # Set by extract() when the extractor runs in debug mode.
        self.debug = None
        self.temp = tempfile.mkdtemp()

    def split(self):
        """Splits file into single page pdfs.

        Every requested page is written to '<temp>/page-<n>.pdf'. When
        get_rotation reports a rotation for the page, the unrotated
        file is kept as '<temp>/p-<n>_rotated.pdf' and a rotated copy
        is written to the original 'page-<n>.pdf' path.
        """
        # The reader is handed an open stream, so the source stays open
        # until every page has been written out.
        with open(self.pdfname, 'rb') as source:
            infile = PdfFileReader(source, strict=False)
            for p in self.pagenos:
                sp_path = os.path.join(self.temp, 'page-{0}.pdf'.format(p))
                outfile = PdfFileWriter()
                outfile.addPage(infile.getPage(p - 1))
                with open(sp_path, 'wb') as f:
                    outfile.write(f)

                layout, _ = get_page_layout(sp_path, char_margin=1.0,
                                            line_margin=0.5, word_margin=0.1)
                lttextlh = get_text_objects(layout, ltype="lh")
                lttextlv = get_text_objects(layout, ltype="lv")
                ltchar = get_text_objects(layout, ltype="char")
                rotation = get_rotation(lttextlh, lttextlv, ltchar)
                if rotation != '':
                    self._write_rotated(p, sp_path, rotation)

    def _write_rotated(self, pageno, sp_path, rotation):
        """Moves a single page pdf aside and rewrites it rotated."""
        # Build the name from its parts instead of str.replace on the
        # whole path, which would also rewrite a 'page' substring that
        # happens to be in the temp directory name.
        sp_new_path = os.path.join(
            self.temp, 'p-{0}_rotated.pdf'.format(pageno))
        os.rename(sp_path, sp_new_path)
        with open(sp_new_path, 'rb') as unrotated:
            sp_in = PdfFileReader(unrotated, strict=False)
            sp_out = PdfFileWriter()
            sp_page = sp_in.getPage(0)
            if rotation == 'left':
                sp_page.rotateClockwise(90)
            elif rotation == 'right':
                sp_page.rotateCounterClockwise(90)
            sp_out.addPage(sp_page)
            with open(sp_path, 'wb') as pdf_out:
                sp_out.write(pdf_out)

    def _map_pages(self, pages):
        """Runs extractor.get_tables on all pages using a process pool."""
        owns_pool = self.pool is None
        pool = mp.Pool(processes=self.cpu_count) if owns_pool else self.pool
        try:
            return pool.map(self.extractor.get_tables, pages)
        finally:
            # Only shut down a pool that was created here.
            if owns_pool:
                pool.close()
                pool.join()

    def extract(self):
        """Runs table extraction by calling extractor.get_tables
        on all single page pdfs.

        Debug data is only collected in the sequential (non parallel)
        mode. If `clean` is set, the temp directory is removed even
        when the extraction fails.

        Returns
        -------
        tables : dict
            Union of the dicts returned by extractor.get_tables for
            every page; pages for which it returned None are skipped.
        """
        try:
            self.split()
            pages = [os.path.join(self.temp, 'page-{0}.pdf'.format(p))
                     for p in self.pagenos]
            if self.parallel:
                results = self._map_pages(pages)
                tables = {k: v for d in results if d is not None
                          for k, v in d.items()}
            else:
                tables = {}
                if self.extractor.debug:
                    if self.extractor.method == 'stream':
                        self.debug = self.extractor.debug
                        self.debug_text = []
                    elif self.extractor.method == 'lattice':
                        self.debug = self.extractor.debug
                        self.debug_images = []
                        self.debug_segments = []
                        self.debug_tables = []
                for p in pages:
                    table = self.extractor.get_tables(p)
                    if table is not None:
                        tables.update(table)
                    if self.extractor.debug:
                        # The extractor exposes the debug data of the
                        # page it processed last; keep one entry per page.
                        if self.extractor.method == 'stream':
                            self.debug_text.append(self.extractor.debug_text)
                        elif self.extractor.method == 'lattice':
                            self.debug_images.append(
                                self.extractor.debug_images)
                            self.debug_segments.append(
                                self.extractor.debug_segments)
                            self.debug_tables.append(
                                self.extractor.debug_tables)
        finally:
            if self.clean:
                self.remove_tempdir()
        return tables

    def remove_tempdir(self):
        """Removes temporary directory that was created to save single
        page pdfs and their images.
        """
        shutil.rmtree(self.temp)

    def debug_plot(self):
        """Generates a matplotlib plot based on the selected extractor
        debug option.

        Raises
        ------
        UserWarning
            If no debug option was recorded by extract().
        ValueError
            If the recorded debug option does not match the data that
            was collected (e.g. a Lattice option after a Stream run).
        """
        # getattr keeps the intended UserWarning (rather than an
        # AttributeError) when no debug state exists on the instance.
        debug = getattr(self, 'debug', None)

        plotter = None
        if debug is not True:
            lattice_plotters = (
                ('contour', self._plot_contours),
                ('joint', self._plot_joints),
                ('line', self._plot_lines),
                ('table', self._plot_tables),
            )
            for option, candidate in lattice_plotters:
                if debug == option:
                    plotter = candidate
                    break
            if plotter is None:
                raise UserWarning("This method can only be called after"
                                  " debug has been specified.")

        # Imported late so that matplotlib is only needed when there is
        # actually something to plot.
        import matplotlib.pyplot as plt
        import matplotlib.patches as patches

        if debug is True:
            try:
                self._plot_text(plt, patches)
            except AttributeError:
                raise ValueError("This option only be used with Stream.")
        else:
            try:
                plotter(plt)
            except AttributeError:
                raise ValueError("This option only be used with Lattice.")

    def _plot_text(self, plt, patches):
        """Shows one figure per page with a rectangle for every text box."""
        for text in self.debug_text:
            fig = plt.figure()
            ax = fig.add_subplot(111, aspect='equal')
            xs, ys = [], []
            for t in text:
                # t is used as (x0, y0, x1, y1) by the rectangle below,
                # so x limits come from t[0]/t[2], y limits from t[1]/t[3].
                xs.extend([t[0], t[2]])
                ys.extend([t[1], t[3]])
                ax.add_patch(
                    patches.Rectangle(
                        (t[0], t[1]),
                        t[2] - t[0],
                        t[3] - t[1]
                    )
                )
            # A page without text has nothing to derive limits from.
            if xs:
                ax.set_xlim(min(xs) - 10, max(xs) + 10)
                ax.set_ylim(min(ys) - 10, max(ys) + 10)
            plt.show()

    def _plot_contours(self, plt):
        """Shows every page image with its table bounding boxes drawn."""
        for img, table_bbox in self.debug_images:
            for t in table_bbox.keys():
                cv2.rectangle(img, (t[0], t[1]),
                              (t[2], t[3]), (255, 0, 0), 3)
            plt.imshow(img)
            plt.show()

    def _plot_joints(self, plt):
        """Shows every page image with the stored joint coordinates."""
        for img, table_bbox in self.debug_images:
            x_coord = []
            y_coord = []
            for k in table_bbox.keys():
                for coord in table_bbox[k]:
                    x_coord.append(coord[0])
                    y_coord.append(coord[1])
            # Without any joint there is nothing to mark or to scale to.
            if x_coord:
                max_x, max_y = max(x_coord), max(y_coord)
                plt.plot(x_coord, y_coord, 'ro')
                # y axis limits are passed high-to-low, matching the image.
                plt.axis([0, max_x + 100, max_y + 100, 0])
            plt.imshow(img)
            plt.show()

    def _plot_lines(self, plt):
        """Shows the vertical and horizontal segments of every page."""
        for v_s, h_s in self.debug_segments:
            for v in v_s:
                plt.plot([v[0], v[2]], [v[1], v[3]])
            for h in h_s:
                plt.plot([h[0], h[2]], [h[1], h[3]])
            plt.show()

    def _plot_tables(self, plt):
        """Shows the cell edges of the tables found on every page."""
        for tables in self.debug_tables:
            for table in tables:
                for r in range(len(table.rows)):
                    for c in range(len(table.cols)):
                        cell = table.cells[r][c]
                        # (edge flag, start corner, end corner)
                        edges = (
                            (cell.left, cell.lb, cell.lt),
                            (cell.right, cell.rb, cell.rt),
                            (cell.top, cell.lt, cell.rt),
                            (cell.bottom, cell.lb, cell.rb),
                        )
                        for present, start, end in edges:
                            if present:
                                plt.plot([start[0], end[0]],
                                         [start[1], end[1]])
            plt.show()