Add OCR support for image based pdfs with lines
* Cosmits * Remove unnecessary kwargs * Direct ghostscript call output to /dev/null * Change char_margin's default value * Add image attribute in Table and Cell * Add OCR * Fix coordinates * Add table_area * Add ocr options to cli * Direct ghostscript call output to /dev/null * Add ocr docstring * Add requirements * Update README pull/2/head
parent
70f626373b
commit
970256e19d
|
|
@ -57,6 +57,10 @@ Currently, camelot works under Python 2.7.
|
||||||
|
|
||||||
The required dependencies include [numpy](http://www.numpy.org/), [OpenCV](http://opencv.org/) and [ImageMagick](http://www.imagemagick.org/script/index.php).
|
The required dependencies include [numpy](http://www.numpy.org/), [OpenCV](http://opencv.org/) and [ImageMagick](http://www.imagemagick.org/script/index.php).
|
||||||
|
|
||||||
|
### Optional
|
||||||
|
|
||||||
|
You'll need to install [Tesseract](https://github.com/tesseract-ocr/tesseract) if you want to extract tables from image-based PDFs. You'll also need the matching Tesseract language pack if your PDF isn't in English.
|
||||||
|
|
||||||
## Installation
|
## Installation
|
||||||
|
|
||||||
Make sure you have the most updated versions for `pip` and `setuptools`. You can update them by
|
Make sure you have the most updated versions for `pip` and `setuptools`. You can update them by
|
||||||
|
|
|
||||||
|
|
@ -1,3 +1,3 @@
|
||||||
__version__ = '1.0.0'
|
__version__ = '1.0.0'
|
||||||
|
|
||||||
__all__ = ['pdf', 'lattice', 'stream']
|
__all__ = ['pdf', 'lattice', 'stream', 'ocr']
|
||||||
|
|
|
||||||
|
|
@ -79,6 +79,7 @@ class Cell:
|
||||||
self.text = ''
|
self.text = ''
|
||||||
self.spanning_h = False
|
self.spanning_h = False
|
||||||
self.spanning_v = False
|
self.spanning_v = False
|
||||||
|
self.image = None
|
||||||
|
|
||||||
def add_text(self, text):
|
def add_text(self, text):
|
||||||
"""Adds text to cell.
|
"""Adds text to cell.
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,148 @@
|
||||||
|
import os
|
||||||
|
import subprocess
|
||||||
|
|
||||||
|
import pyocr
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
from .table import Table
|
||||||
|
from .imgproc import (adaptive_threshold, find_lines, find_table_contours,
|
||||||
|
find_table_joints)
|
||||||
|
from .utils import merge_close_values, encode_list
|
||||||
|
|
||||||
|
|
||||||
|
class OCR:
    """Uses optical character recognition to get text out of image based pdfs.
    Currently works only on pdfs with lines.

    Parameters
    ----------
    table_area : list
        List of strings of the form x1,y1,x2,y2 where
        (x1, y1) -> left-top and (x2, y2) -> right-bottom in PDFMiner's
        coordinate space, denoting table areas to analyze.
        (optional, default: None)

    mtol : list
        List of ints specifying m-tolerance parameters. A single value
        is applied to every table found on a page; otherwise one value
        per table is expected.
        (optional, default: [2])

    dpi : int
        Dots per inch.
        (optional, default: 300)

    lang : string
        Language to be used for OCR.
        (optional, default: 'eng')

    scale : int
        Used to divide the height/width of a pdf to get a structuring
        element for image processing.
        (optional, default: 15)

    debug : string
        {'contour', 'line', 'joint', 'table'}
        Set to one of the above values to generate a matplotlib plot
        of detected contours, lines, joints and the table generated.
        (optional, default: None)
    """
    def __init__(self, table_area=None, mtol=None, dpi=300, lang="eng",
                 scale=15, debug=None):

        self.method = 'ocr'
        self.table_area = table_area
        # None stands in for the documented default [2]; copying keeps the
        # caller's list (and any shared default) from being aliased.
        self.mtol = [2] if mtol is None else list(mtol)
        # pyocr returns an empty list when no OCR backend is installed;
        # indexing it blindly would raise IndexError and make the
        # `self.tool is None` guard in get_tables unreachable.
        tools = pyocr.get_available_tools()
        self.tool = tools[0] if tools else None
        self.dpi = dpi
        self.lang = lang
        self.scale = scale
        self.debug = debug

    def get_tables(self, pdfname):
        """Renders a pdf to an image, finds lattice tables and OCRs each cell.

        The pdf is rasterised with ghostscript to a png written next to
        the pdf (same base name, '.png' extension).

        Parameters
        ----------
        pdfname : string
            Path to the pdf file to process.

        Returns
        -------
        page : dict
            {<pdf base name without extension>:
                {'table-N': {'data': <rows of cell text>}}},
            or None when no OCR tool is available or when debug is set
            (in debug mode the results are kept on the debug_* attributes
            instead).
        """
        if self.tool is None:
            return None

        bname, __ = os.path.splitext(pdfname)
        imagename = ''.join([bname, '.png'])

        gs_call = [
            "-q", "-sDEVICE=png16m", "-o", imagename,
            "-r{0}".format(self.dpi), pdfname
        ]
        # check_output returns bytes on Python 3 and str on Python 2; a
        # bytes literal compares correctly on both.
        version = subprocess.check_output(["gs", "-version"])
        if b"ghostscript" in version.lower():
            gs_call.insert(0, "gs")
        else:
            gs_call.insert(0, "gsc")
        # Close the /dev/null handle deterministically once gs has exited.
        with open(os.devnull, 'w') as devnull:
            subprocess.call(gs_call, stdout=devnull,
                            stderr=subprocess.STDOUT)

        img, threshold = adaptive_threshold(imagename)
        vmask, v_segments = find_lines(threshold, direction='vertical',
                                       scale=self.scale)
        hmask, h_segments = find_lines(threshold, direction='horizontal',
                                       scale=self.scale)

        if self.table_area is not None:
            # NOTE(review): the parsed values are handed to
            # find_table_joints as (x, y, width, height) without any
            # scaling -- confirm they are expected in image coordinates.
            areas = []
            for area in self.table_area:
                x1, y1, x2, y2 = [int(v) for v in area.split(",")]
                areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1)))
            table_bbox = find_table_joints(areas, vmask, hmask)
        else:
            contours = find_table_contours(vmask, hmask)
            table_bbox = find_table_joints(contours, vmask, hmask)

        if self.debug:
            self.debug_images = (img, table_bbox)
            self.debug_segments = (v_segments, h_segments)
            self.debug_tables = []

        # Expand a single tolerance to one entry per table in a local
        # list. Writing the expanded list back to self.mtol would leave a
        # list sized for this page and break the next get_tables() call
        # whenever the pages hold a different number of tables.
        mtol = self.mtol
        if len(mtol) == 1:
            mtol = mtol * len(table_bbox)

        page = {}
        tables = {}
        ordered_keys = sorted(table_bbox.keys(), key=lambda x: x[1])
        for table_no, k in enumerate(ordered_keys):
            table_data = {}
            cols, rows = zip(*table_bbox[k])
            cols, rows = list(cols), list(rows)
            # Add the bounding box edges so the outer cells are closed.
            cols.extend([k[0], k[2]])
            rows.extend([k[1], k[3]])
            cols = merge_close_values(sorted(cols), mtol=mtol[table_no])
            rows = merge_close_values(sorted(rows, reverse=True),
                                      mtol=mtol[table_no])
            # Pair up neighbouring coordinates into (start, end) spans.
            cols = [(cols[i], cols[i + 1])
                    for i in range(0, len(cols) - 1)]
            rows = [(rows[i], rows[i + 1])
                    for i in range(0, len(rows) - 1)]
            table = Table(cols, rows)
            if self.debug:
                self.debug_tables.append(table)
            table.image = img[k[3]:k[1], k[0]:k[2]]
            for cell_row in table.cells:
                for cell in cell_row:
                    x1 = int(cell.x1)
                    y1 = int(cell.y1)
                    x2 = int(cell.x2)
                    y2 = int(cell.y2)
                    # Crop the cell out of the page image and OCR it alone.
                    cell.image = img[y1:y2, x1:x2]
                    text = self.tool.image_to_string(
                        Image.fromarray(cell.image),
                        lang=self.lang,
                        builder=pyocr.builders.TextBuilder()
                    )
                    cell.add_text(text)
            ar = table.get_list()
            ar.reverse()
            ar = encode_list(ar)
            table_data['data'] = ar
            tables['table-{0}'.format(table_no + 1)] = table_data
        page[os.path.basename(bname)] = tables

        if self.debug:
            return None

        return page
|
||||||
|
|
@ -126,7 +126,7 @@ class Pdf:
|
||||||
if self.extractor.method == 'stream':
|
if self.extractor.method == 'stream':
|
||||||
self.debug = self.extractor.debug
|
self.debug = self.extractor.debug
|
||||||
self.debug_text = []
|
self.debug_text = []
|
||||||
elif self.extractor.method == 'lattice':
|
elif self.extractor.method in ['lattice', 'ocr']:
|
||||||
self.debug = self.extractor.debug
|
self.debug = self.extractor.debug
|
||||||
self.debug_images = []
|
self.debug_images = []
|
||||||
self.debug_segments = []
|
self.debug_segments = []
|
||||||
|
|
@ -138,7 +138,7 @@ class Pdf:
|
||||||
if self.extractor.debug:
|
if self.extractor.debug:
|
||||||
if self.extractor.method == 'stream':
|
if self.extractor.method == 'stream':
|
||||||
self.debug_text.append(self.extractor.debug_text)
|
self.debug_text.append(self.extractor.debug_text)
|
||||||
elif self.extractor.method == 'lattice':
|
elif self.extractor.method in ['lattice', 'ocr']:
|
||||||
self.debug_images.append(self.extractor.debug_images)
|
self.debug_images.append(self.extractor.debug_images)
|
||||||
self.debug_segments.append(self.extractor.debug_segments)
|
self.debug_segments.append(self.extractor.debug_segments)
|
||||||
self.debug_tables.append(self.extractor.debug_tables)
|
self.debug_tables.append(self.extractor.debug_tables)
|
||||||
|
|
|
||||||
|
|
@ -34,6 +34,7 @@ class Table:
|
||||||
self.cells = [[Cell(c[0], r[1], c[1], r[0])
|
self.cells = [[Cell(c[0], r[1], c[1], r[0])
|
||||||
for c in cols] for r in rows]
|
for c in cols] for r in rows]
|
||||||
self.nocont_ = 0
|
self.nocont_ = 0
|
||||||
|
self.image = None
|
||||||
|
|
||||||
def set_all_edges(self):
|
def set_all_edges(self):
|
||||||
"""Sets all table edges to True.
|
"""Sets all table edges to True.
|
||||||
|
|
|
||||||
|
|
@ -3,5 +3,7 @@ matplotlib
|
||||||
nose
|
nose
|
||||||
pdfminer
|
pdfminer
|
||||||
pyexcel-xlsx
|
pyexcel-xlsx
|
||||||
|
Pillow
|
||||||
|
pyocr
|
||||||
PyPDF2
|
PyPDF2
|
||||||
Sphinx
|
Sphinx
|
||||||
|
|
|
||||||
|
|
@ -17,6 +17,7 @@ from PyPDF2 import PdfFileReader
|
||||||
from camelot.pdf import Pdf
|
from camelot.pdf import Pdf
|
||||||
from camelot.lattice import Lattice
|
from camelot.lattice import Lattice
|
||||||
from camelot.stream import Stream
|
from camelot.stream import Stream
|
||||||
|
from camelot.ocr import OCR
|
||||||
|
|
||||||
|
|
||||||
doc = """
|
doc = """
|
||||||
|
|
@ -52,6 +53,7 @@ options:
|
||||||
camelot methods:
|
camelot methods:
|
||||||
lattice Looks for lines between data.
|
lattice Looks for lines between data.
|
||||||
stream Looks for spaces between data.
|
stream Looks for spaces between data.
|
||||||
|
ocr Looks for lines in image based pdfs.
|
||||||
|
|
||||||
See 'camelot <method> -h' for more information on a specific method.
|
See 'camelot <method> -h' for more information on a specific method.
|
||||||
"""
|
"""
|
||||||
|
|
@ -101,6 +103,26 @@ options:
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
ocr_doc = """
|
||||||
|
OCR method looks for lines in image based pdfs.
|
||||||
|
|
||||||
|
usage:
|
||||||
|
camelot ocr [-t <tarea>] [-m <mtol>] [options] [--] <file>
|
||||||
|
|
||||||
|
options:
|
||||||
|
-t, --tarea <tarea> Specific table areas to analyze.
|
||||||
|
-m, --mtol <mtol> Tolerance to account for when merging lines
|
||||||
|
which are very close. [default: 2]
|
||||||
|
-D, --dpi <dpi> Dots per inch, specify image quality to be used for OCR.
|
||||||
|
[default: 300]
|
||||||
|
-l, --lang <lang> Specify language to be used for OCR. [default: eng]
|
||||||
|
-s, --scale <scale> Scaling factor. Large scaling factor leads to
|
||||||
|
smaller lines being detected. [default: 15]
|
||||||
|
-d, --debug <debug> Debug by visualizing pdf geometry.
|
||||||
|
(contour,line,joint,table) Example: -d table
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
def plot_table_barchart(r, c, p, pno, tno):
|
def plot_table_barchart(r, c, p, pno, tno):
|
||||||
row_idx = [i + 1 for i, row in enumerate(r)]
|
row_idx = [i + 1 for i, row in enumerate(r)]
|
||||||
col_idx = [i + 1 for i, col in enumerate(c)]
|
col_idx = [i + 1 for i, col in enumerate(c)]
|
||||||
|
|
@ -315,6 +337,8 @@ if __name__ == '__main__':
|
||||||
args.update(docopt(lattice_doc, argv=argv))
|
args.update(docopt(lattice_doc, argv=argv))
|
||||||
elif args['<method>'] == 'stream':
|
elif args['<method>'] == 'stream':
|
||||||
args.update(docopt(stream_doc, argv=argv))
|
args.update(docopt(stream_doc, argv=argv))
|
||||||
|
elif args['<method>'] == 'ocr':
|
||||||
|
args.update(docopt(ocr_doc, argv=argv))
|
||||||
|
|
||||||
vprint = print if args['--verbose'] else lambda *a, **k: None
|
vprint = print if args['--verbose'] else lambda *a, **k: None
|
||||||
filename = args['<file>']
|
filename = args['<file>']
|
||||||
|
|
@ -487,6 +511,69 @@ if __name__ == '__main__':
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.exception(e.message, exc_info=True)
|
logging.exception(e.message, exc_info=True)
|
||||||
sys.exit()
|
sys.exit()
|
||||||
|
elif args['<method>'] == 'ocr':
|
||||||
|
try:
|
||||||
|
tarea = args['--tarea'] if args['--tarea'] else None
|
||||||
|
mtol = [int(m) for m in args['--mtol']]
|
||||||
|
manager = Pdf(OCR(table_area=tarea, mtol=mtol, dpi=int(args['--dpi']),
|
||||||
|
lang=args['--lang'], scale=int(args['--scale']),
|
||||||
|
debug=args['--debug']),
|
||||||
|
filename,
|
||||||
|
pagenos=p,
|
||||||
|
parallel=args['--parallel'],
|
||||||
|
clean=True)
|
||||||
|
data = manager.extract()
|
||||||
|
|
||||||
|
processing_time = time.time() - start_time
|
||||||
|
vprint("Finished processing in", processing_time, "seconds")
|
||||||
|
logging.info("Finished processing in " + str(processing_time) + " seconds")
|
||||||
|
|
||||||
|
if args['--plot']:
|
||||||
|
if args['--output']:
|
||||||
|
pngname = os.path.join(args['--output'], os.path.basename(pngname))
|
||||||
|
plot_type = args['--plot'].split(',')
|
||||||
|
if 'page' in plot_type:
|
||||||
|
for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
|
||||||
|
page = data[page_number]
|
||||||
|
for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
|
||||||
|
table = page[table_number]
|
||||||
|
plot_table_barchart(table['r_nempty_cells'],
|
||||||
|
table['c_nempty_cells'],
|
||||||
|
table['empty_p'],
|
||||||
|
page_number,
|
||||||
|
table_number)
|
||||||
|
|
||||||
|
if 'all' in plot_type:
|
||||||
|
plot_all_barchart(data, pngname)
|
||||||
|
|
||||||
|
if 'rc' in plot_type:
|
||||||
|
plot_rc_piechart(data, pngname)
|
||||||
|
|
||||||
|
if args['--print-stats']:
|
||||||
|
print_stats(data, processing_time)
|
||||||
|
|
||||||
|
if args['--save-stats']:
|
||||||
|
if args['--output']:
|
||||||
|
scorename = os.path.join(args['--output'], os.path.basename(scorename))
|
||||||
|
with open(scorename, 'w') as score_file:
|
||||||
|
score_file.write('table,nrows,ncols,empty_p,line_p,text_p,score\n')
|
||||||
|
for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
|
||||||
|
page = data[page_number]
|
||||||
|
for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
|
||||||
|
table = page[table_number]
|
||||||
|
score_file.write('{0},{1},{2},{3},{4},{5},{6}\n'.format(
|
||||||
|
''.join([page_number, '_', table_number]),
|
||||||
|
table['nrows'],
|
||||||
|
table['ncols'],
|
||||||
|
table['empty_p'],
|
||||||
|
table['line_p'],
|
||||||
|
table['text_p'],
|
||||||
|
table['score']))
|
||||||
|
if args['--debug']:
|
||||||
|
manager.debug_plot()
|
||||||
|
except Exception as e:
|
||||||
|
logging.exception(e.message, exc_info=True)
|
||||||
|
sys.exit()
|
||||||
|
|
||||||
if args['--debug']:
|
if args['--debug']:
|
||||||
print("See 'camelot <method> -h' for various parameters you can tweak.")
|
print("See 'camelot <method> -h' for various parameters you can tweak.")
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue