Move Pdf class to core as FileHandler
parent
0c9e21d881
commit
5d29f0c21c
220
camelot/core.py
220
camelot/core.py
|
|
@ -1,69 +1,14 @@
|
|||
import os
|
||||
import tempfile
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from PyPDF2 import PdfFileReader, PdfFileWriter
|
||||
|
||||
from .utils import get_page_layout, get_text_objects, get_rotation
|
||||
|
||||
|
||||
class Cell(object):
|
||||
"""Cell.
|
||||
Defines a cell object with coordinates relative to a left-bottom
|
||||
origin, which is also PDFMiner's coordinate space.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
x1 : float
|
||||
x-coordinate of left-bottom point.
|
||||
|
||||
y1 : float
|
||||
y-coordinate of left-bottom point.
|
||||
|
||||
x2 : float
|
||||
x-coordinate of right-top point.
|
||||
|
||||
y2 : float
|
||||
y-coordinate of right-top point.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
lb : tuple
|
||||
Tuple representing left-bottom coordinates.
|
||||
|
||||
lt : tuple
|
||||
Tuple representing left-top coordinates.
|
||||
|
||||
rb : tuple
|
||||
Tuple representing right-bottom coordinates.
|
||||
|
||||
rt : tuple
|
||||
Tuple representing right-top coordinates.
|
||||
|
||||
bbox : tuple
|
||||
Tuple representing the cell's bounding box using the
|
||||
lower-bottom and right-top coordinates.
|
||||
|
||||
left : bool
|
||||
Whether or not cell is bounded on the left.
|
||||
|
||||
right : bool
|
||||
Whether or not cell is bounded on the right.
|
||||
|
||||
top : bool
|
||||
Whether or not cell is bounded on the top.
|
||||
|
||||
bottom : bool
|
||||
Whether or not cell is bounded on the bottom.
|
||||
|
||||
text_objects : list
|
||||
List of text objects assigned to cell.
|
||||
|
||||
text : string
|
||||
Text assigned to cell.
|
||||
|
||||
spanning_h : bool
|
||||
Whether or not cell spans/extends horizontally.
|
||||
|
||||
spanning_v : bool
|
||||
Whether or not cell spans/extends vertically.
|
||||
"""
|
||||
|
||||
def __init__(self, x1, y1, x2, y2):
|
||||
|
||||
self.x1 = x1
|
||||
|
|
@ -86,76 +31,23 @@ class Cell(object):
|
|||
self.image = None
|
||||
|
||||
def add_text(self, text):
|
||||
"""Adds text to cell.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
text : string
|
||||
"""
|
||||
self.text = ''.join([self.text, text])
|
||||
|
||||
def get_text(self):
|
||||
"""Returns text assigned to cell.
|
||||
|
||||
Returns
|
||||
-------
|
||||
text : string
|
||||
"""
|
||||
return self.text
|
||||
|
||||
def add_object(self, t_object):
|
||||
"""Adds PDFMiner text object to cell.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
t_object : object
|
||||
"""
|
||||
self.text_objects.append(t_object)
|
||||
|
||||
def get_objects(self):
|
||||
"""Returns list of text objects assigned to cell.
|
||||
|
||||
Returns
|
||||
-------
|
||||
text_objects : list
|
||||
"""
|
||||
return self.text_objects
|
||||
|
||||
def get_bounded_edges(self):
|
||||
"""Returns the number of edges by which a cell is bounded.
|
||||
|
||||
Returns
|
||||
-------
|
||||
bounded_edges : int
|
||||
"""
|
||||
self.bounded_edges = self.top + self.bottom + self.left + self.right
|
||||
return self.bounded_edges
|
||||
|
||||
|
||||
class Table(object):
|
||||
"""Table.
|
||||
Defines a table object with coordinates relative to a left-bottom
|
||||
origin, which is also PDFMiner's coordinate space.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
cols : list
|
||||
List of tuples representing column x-coordinates in increasing
|
||||
order.
|
||||
|
||||
rows : list
|
||||
List of tuples representing row y-coordinates in decreasing
|
||||
order.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
cells : list
|
||||
List of cell objects with row-major ordering.
|
||||
|
||||
nocont_ : int
|
||||
Number of lines that did not contribute to setting cell edges.
|
||||
"""
|
||||
|
||||
def __init__(self, cols, rows):
|
||||
|
||||
self.cols = cols
|
||||
|
|
@ -166,8 +58,6 @@ class Table(object):
|
|||
self.image = None
|
||||
|
||||
def set_all_edges(self):
|
||||
"""Sets all table edges to True.
|
||||
"""
|
||||
for r in range(len(self.rows)):
|
||||
for c in range(len(self.cols)):
|
||||
self.cells[r][c].left = True
|
||||
|
|
@ -177,8 +67,6 @@ class Table(object):
|
|||
return self
|
||||
|
||||
def set_border_edges(self):
|
||||
"""Sets table border edges to True.
|
||||
"""
|
||||
for r in range(len(self.rows)):
|
||||
self.cells[r][0].left = True
|
||||
self.cells[r][len(self.cols) - 1].right = True
|
||||
|
|
@ -188,19 +76,6 @@ class Table(object):
|
|||
return self
|
||||
|
||||
def set_edges(self, vertical, horizontal, jtol=2):
|
||||
"""Sets a cell's edges to True depending on whether they
|
||||
overlap with lines found by imgproc.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
vertical : list
|
||||
List of vertical lines detected by imgproc. Coordinates
|
||||
scaled and translated to the PDFMiner's coordinate space.
|
||||
|
||||
horizontal : list
|
||||
List of horizontal lines detected by imgproc. Coordinates
|
||||
scaled and translated to the PDFMiner's coordinate space.
|
||||
"""
|
||||
for v in vertical:
|
||||
# find closest x coord
|
||||
# iterate over y coords and find closest points
|
||||
|
|
@ -308,10 +183,6 @@ class Table(object):
|
|||
return self
|
||||
|
||||
def set_spanning(self):
|
||||
"""Sets a cell's spanning_h or spanning_v attribute to True
|
||||
depending on whether the cell spans/extends horizontally or
|
||||
vertically.
|
||||
"""
|
||||
for r in range(len(self.rows)):
|
||||
for c in range(len(self.cols)):
|
||||
bound = self.cells[r][c].get_bounded_edges()
|
||||
|
|
@ -351,13 +222,6 @@ class Table(object):
|
|||
return self
|
||||
|
||||
def get_list(self):
|
||||
"""Returns a two-dimensional list of text assigned to each
|
||||
cell.
|
||||
|
||||
Returns
|
||||
-------
|
||||
ar : list
|
||||
"""
|
||||
ar = []
|
||||
for r in range(len(self.rows)):
|
||||
ar.append([self.cells[r][c].get_text().strip()
|
||||
|
|
@ -367,3 +231,75 @@ class Table(object):
|
|||
|
||||
class TableSet(object):
|
||||
pass
|
||||
|
||||
|
||||
class FileHandler(object):
    """File handler.

    Splits a pdf file into single page pdfs inside a temporary
    directory and runs a parser on each of them.

    Parameters
    ----------
    filename : string
        Path to pdf file.

    pages : string
        Comma-separated page numbers and/or ranges, for example
        '1', '1,3,4', '2-5', '4-end' or 'all'.
        (optional, default: '1')

    parser : object
        Object exposing a get_tables(path) method; it is called once
        per single page pdf by parse. May also be assigned to the
        parser attribute after construction.
        (optional, default: None)

    Attributes
    ----------
    pages : list
        Sorted list of unique int page numbers to process.

    temp : string
        Path to the temporary directory holding single page pdfs.
    """

    def __init__(self, filename, pages='1', parser=None):
        self.filename = filename
        if not self.filename.lower().endswith('.pdf'):
            raise TypeError("File format not supported.")
        # Private (name-mangled) helpers must be reached through self.
        self.pages = self.__get_pages(self.filename, pages)
        self.parser = parser
        self.temp = tempfile.mkdtemp()

    @staticmethod
    def __get_pages(filename, pages):
        """Converts a page specification string to a list of ints.

        Parameters
        ----------
        filename : string
            Path to pdf file. Only opened when the page count is
            needed, i.e. for 'all' or an 'N-end' range.

        pages : string
            Comma-separated page numbers and/or ranges.

        Returns
        -------
        page_numbers : list
            Sorted list of unique int page numbers.
        """
        def count_pages():
            # Close the file as soon as the page count is known.
            with open(filename, 'rb') as fileobj:
                return PdfFileReader(fileobj, strict=False).getNumPages()

        ranges = []
        if pages == 'all':
            ranges.append((1, count_pages()))
        else:
            for r in pages.split(','):
                if '-' in r:
                    a, b = r.split('-')
                    if b.strip() == 'end':
                        b = count_pages()
                    ranges.append((int(a), int(b)))
                else:
                    ranges.append((int(r), int(r)))

        page_numbers = set()
        for start, end in ranges:
            page_numbers.update(range(start, end + 1))
        return sorted(page_numbers)

    @staticmethod
    def __save_page(filename, page, temp):
        """Saves one page of a pdf as temp/page-<page>.pdf.

        If get_rotation reports the page as rotated, the extracted file
        is moved to temp/p-<page>_rotated.pdf and a copy rotated by 90
        degrees is written back to temp/page-<page>.pdf.

        Parameters
        ----------
        filename : string
            Path to pdf file.

        page : int
            1-based page number to extract.

        temp : string
            Directory in which the single page pdf is written.
        """
        fpath = os.path.join(temp, 'page-{0}.pdf'.format(page))
        with open(filename, 'rb') as fileobj:
            infile = PdfFileReader(fileobj, strict=False)
            outfile = PdfFileWriter()
            outfile.addPage(infile.getPage(page - 1))
            # The source file must still be open while the page is written.
            with open(fpath, 'wb') as f:
                outfile.write(f)

        layout, _ = get_page_layout(fpath)
        # fix rotated pdf
        lttextlh = get_text_objects(layout, ltype="lh")
        lttextlv = get_text_objects(layout, ltype="lv")
        ltchar = get_text_objects(layout, ltype="char")
        rotation = get_rotation(lttextlh, lttextlv, ltchar)
        if rotation == '':
            return

        # Build the backup name from the file name only, so that a temp
        # directory whose path contains 'page' is left untouched.
        fpath_new = os.path.join(temp, 'p-{0}_rotated.pdf'.format(page))
        os.rename(fpath, fpath_new)
        with open(fpath_new, 'rb') as rotated:
            infile = PdfFileReader(rotated, strict=False)
            outfile = PdfFileWriter()
            p = infile.getPage(0)
            if rotation == 'left':
                p.rotateClockwise(90)
            elif rotation == 'right':
                p.rotateCounterClockwise(90)
            outfile.addPage(p)
            with open(fpath, 'wb') as f:
                outfile.write(f)

    def parse(self):
        """Splits the file into single page pdfs and parses each one.

        Returns
        -------
        tables : dict
            Merged results of parser.get_tables over all pages; pages
            for which it returns None are skipped.

        Raises
        ------
        ValueError
            If no parser has been set.
        """
        if self.parser is None:
            raise ValueError("A parser must be set before calling parse.")
        for p in self.pages:
            self.__save_page(self.filename, p, self.temp)
        pages = [os.path.join(self.temp, 'page-{0}.pdf'.format(p))
                 for p in self.pages]
        tables = {}
        for p in pages:
            table = self.parser.get_tables(p)
            if table is not None:
                tables.update(table)
        return tables
|
||||
268
camelot/pdf.py
268
camelot/pdf.py
|
|
@ -1,268 +0,0 @@
|
|||
import os
|
||||
import shutil
|
||||
import tempfile
|
||||
import itertools
|
||||
import multiprocessing as mp
|
||||
from functools import partial
|
||||
|
||||
import cv2
|
||||
from PyPDF2 import PdfFileReader, PdfFileWriter
|
||||
|
||||
from .utils import get_page_layout, get_text_objects, get_rotation
|
||||
|
||||
|
||||
__all__ = ['Pdf']
|
||||
|
||||
|
||||
def _parse_page_numbers(pagenos):
|
||||
"""Converts list of dicts to list of ints.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
pagenos : list
|
||||
List of dicts representing page ranges. A dict must have only
|
||||
two keys named 'start' and 'end' having int as their value.
|
||||
|
||||
Returns
|
||||
-------
|
||||
page_numbers : list
|
||||
List of int page numbers.
|
||||
"""
|
||||
page_numbers = []
|
||||
for p in pagenos:
|
||||
page_numbers.extend(range(p['start'], p['end'] + 1))
|
||||
page_numbers = sorted(set(page_numbers))
|
||||
return page_numbers
|
||||
|
||||
|
||||
def _save_page(temp, pdfname, pageno):
    """Saves one page of a pdf as temp/page-<pageno>.pdf.

    If get_rotation reports the page as rotated, the extracted file is
    moved to temp/p-<pageno>_rotated.pdf and a copy rotated by 90
    degrees is written back to temp/page-<pageno>.pdf.

    Parameters
    ----------
    temp : string
        Directory in which the single page pdf is written.

    pdfname : string
        Path to pdf file.

    pageno : int
        1-based page number to extract.
    """
    sp_path = os.path.join(temp, 'page-{0}.pdf'.format(pageno))
    with open(pdfname, 'rb') as pdffile:
        infile = PdfFileReader(pdffile, strict=False)
        outfile = PdfFileWriter()
        outfile.addPage(infile.getPage(pageno - 1))
        # The source file must still be open while the page is written.
        with open(sp_path, 'wb') as f:
            outfile.write(f)

    layout, _ = get_page_layout(sp_path)
    lttextlh = get_text_objects(layout, ltype="lh")
    lttextlv = get_text_objects(layout, ltype="lv")
    ltchar = get_text_objects(layout, ltype="char")
    rotation = get_rotation(lttextlh, lttextlv, ltchar)
    if rotation == '':
        return

    # Build the backup name from the file name only, so that a temp
    # directory whose path contains 'page' is left untouched.
    sp_new_path = os.path.join(temp, 'p-{0}_rotated.pdf'.format(pageno))
    os.rename(sp_path, sp_new_path)
    # Keep the rotated source open only for as long as it is being copied.
    with open(sp_new_path, 'rb') as rotated:
        sp_in = PdfFileReader(rotated, strict=False)
        sp_out = PdfFileWriter()
        sp_page = sp_in.getPage(0)
        if rotation == 'left':
            sp_page.rotateClockwise(90)
        elif rotation == 'right':
            sp_page.rotateCounterClockwise(90)
        sp_out.addPage(sp_page)
        with open(sp_path, 'wb') as pdf_out:
            sp_out.write(pdf_out)
|
||||
|
||||
|
||||
class Pdf(object):
    """Pdf manager.

    Handles all operations like temp directory creation, splitting file
    into single page pdfs, running extraction using multiple processes
    and removing the temp directory.

    Parameters
    ----------
    extractor : object
        camelot.stream.Stream or camelot.lattice.Lattice extractor
        object.

    pdfname : string
        Path to pdf file.

    pagenos : list
        List of dicts representing page ranges. A dict must have only
        two keys named 'start' and 'end' having int as their value.
        (optional, default: [{'start': 1, 'end': 1}])

    parallel : bool
        Whether or not to run using multiple processes.
        (optional, default: False)

    clean : bool
        Whether or not to remove the temp directory.
        (optional, default: False)
    """

    def __init__(self, extractor, pdfname, pagenos=None,
                 parallel=False, clean=False):

        self.extractor = extractor
        self.pdfname = pdfname
        if not self.pdfname.endswith('.pdf'):
            raise TypeError("File format not supported.")
        # None stands in for the documented default so that no list
        # object is shared between calls.
        if pagenos is None:
            pagenos = [{'start': 1, 'end': 1}]
        self.pagenos = _parse_page_numbers(pagenos)
        self.parallel = parallel
        if self.parallel:
            self.cpu_count = mp.cpu_count()
            self.pool = mp.Pool(processes=self.cpu_count)
        self.clean = clean
        # Always defined, so debug_plot can report that debug was never
        # specified instead of failing on a missing attribute.
        self.debug = False
        self.temp = tempfile.mkdtemp()

    def split(self):
        """Splits file into single page pdfs.
        """
        if self.parallel:
            pfunc = partial(_save_page, self.temp, self.pdfname)
            self.pool.map(pfunc, self.pagenos)
        else:
            for p in self.pagenos:
                _save_page(self.temp, self.pdfname, p)

    def extract(self):
        """Runs table extraction by calling extractor.get_tables
        on all single page pdfs.

        Debug data is collected only when running without multiple
        processes.

        Returns
        -------
        tables : dict
            Merged results of extractor.get_tables over all pages;
            pages for which it returns None are skipped.
        """
        self.split()
        pages = [os.path.join(self.temp, 'page-{0}.pdf'.format(p))
                 for p in self.pagenos]
        if self.parallel:
            results = self.pool.map(self.extractor.get_tables, pages)
            tables = {k: v for d in results if d is not None
                      for k, v in d.items()}
        else:
            tables = {}
            # One tuple for both the setup and the per-page collection,
            # so the debug lists that get created are the ones filled.
            lattice_methods = ('lattice', 'ocrl')
            if self.extractor.debug:
                if self.extractor.method == 'stream':
                    self.debug = self.extractor.debug
                    self.debug_text = []
                elif self.extractor.method in lattice_methods:
                    self.debug = self.extractor.debug
                    self.debug_images = []
                    self.debug_segments = []
                    self.debug_tables = []
                elif self.extractor.method == 'ocrs':
                    self.debug = self.extractor.debug
                    self.debug_images = []
            for p in pages:
                table = self.extractor.get_tables(p)
                if table is not None:
                    tables.update(table)
                if self.extractor.debug:
                    if self.extractor.method == 'stream':
                        self.debug_text.append(self.extractor.debug_text)
                    elif self.extractor.method in lattice_methods:
                        self.debug_images.append(self.extractor.debug_images)
                        self.debug_segments.append(
                            self.extractor.debug_segments)
                        self.debug_tables.append(self.extractor.debug_tables)
                    elif self.extractor.method == 'ocrs':
                        self.debug_images.append(self.extractor.debug_images)
        if self.clean:
            self.remove_tempdir()
        return tables

    def remove_tempdir(self):
        """Removes temporary directory that was created to save single
        page pdfs and their images.
        """
        shutil.rmtree(self.temp)

    def debug_plot(self):
        """Generates a matplotlib plot based on the selected extractor
        debug option.

        Raises
        ------
        ValueError
            If a 'contour', 'joint', 'line' or 'table' plot is requested
            but the matching debug data was not collected.

        UserWarning
            If debug has not been specified.
        """
        import matplotlib.pyplot as plt
        import matplotlib.patches as patches

        if self.debug is True:
            if hasattr(self, 'debug_text'):
                for text in self.debug_text:
                    fig = plt.figure()
                    ax = fig.add_subplot(111, aspect='equal')
                    xs, ys = [], []
                    for t in text:
                        # t is drawn as a rectangle from (t[0], t[1]) to
                        # (t[2], t[3]), so x limits come from t[0]/t[2]
                        # and y limits from t[1]/t[3].
                        xs.extend([t[0], t[2]])
                        ys.extend([t[1], t[3]])
                        ax.add_patch(
                            patches.Rectangle(
                                (t[0], t[1]),
                                t[2] - t[0],
                                t[3] - t[1]
                            )
                        )
                    ax.set_xlim(min(xs) - 10, max(xs) + 10)
                    ax.set_ylim(min(ys) - 10, max(ys) + 10)
                    plt.show()
            elif hasattr(self, 'debug_images'):
                for img in self.debug_images:
                    plt.imshow(img)
                    plt.show()
        elif self.debug == 'contour':
            try:
                for img, table_bbox in self.debug_images:
                    for t in table_bbox.keys():
                        cv2.rectangle(img, (t[0], t[1]),
                                      (t[2], t[3]), (255, 0, 0), 3)
                    plt.imshow(img)
                    plt.show()
            except AttributeError:
                raise ValueError("This option can only be used with Lattice.")
        elif self.debug == 'joint':
            try:
                for img, table_bbox in self.debug_images:
                    x_coord = []
                    y_coord = []
                    for k in table_bbox.keys():
                        for coord in table_bbox[k]:
                            x_coord.append(coord[0])
                            y_coord.append(coord[1])
                    max_x, max_y = max(x_coord), max(y_coord)
                    plt.plot(x_coord, y_coord, 'ro')
                    plt.axis([0, max_x + 100, max_y + 100, 0])
                    plt.imshow(img)
                    plt.show()
            except AttributeError:
                raise ValueError("This option can only be used with Lattice.")
        elif self.debug == 'line':
            try:
                for v_s, h_s in self.debug_segments:
                    for v in v_s:
                        plt.plot([v[0], v[2]], [v[1], v[3]])
                    for h in h_s:
                        plt.plot([h[0], h[2]], [h[1], h[3]])
                    plt.show()
            except AttributeError:
                raise ValueError("This option can only be used with Lattice.")
        elif self.debug == 'table':
            try:
                for tables in self.debug_tables:
                    for table in tables:
                        for r in range(len(table.rows)):
                            for c in range(len(table.cols)):
                                cell = table.cells[r][c]
                                # Draw only the edges the cell is
                                # bounded by.
                                if cell.left:
                                    plt.plot([cell.lb[0], cell.lt[0]],
                                             [cell.lb[1], cell.lt[1]])
                                if cell.right:
                                    plt.plot([cell.rb[0], cell.rt[0]],
                                             [cell.rb[1], cell.rt[1]])
                                if cell.top:
                                    plt.plot([cell.lt[0], cell.rt[0]],
                                             [cell.lt[1], cell.rt[1]])
                                if cell.bottom:
                                    plt.plot([cell.lb[0], cell.rb[0]],
                                             [cell.lb[1], cell.rb[1]])
                    plt.show()
            except AttributeError:
                raise ValueError("This option can only be used with Lattice.")
        else:
            raise UserWarning("This method can only be called after"
                              " debug has been specified.")
|
||||
Loading…
Reference in New Issue