Add OCR support for image based pdfs with lines

* Cosmetics

* Remove unnecessary kwargs

* Direct ghostscript call output to /dev/null

* Change char_margin's default value

* Add image attribute in Table and Cell

* Add OCR

* Fix coordinates

* Add table_area

* Add ocr options to cli

* Direct ghostscript call output to /dev/null

* Add ocr docstring

* Add requirements

* Update README
pull/2/head
Vinayak Mehta 2017-01-07 16:37:56 +05:30 committed by GitHub
parent 70f626373b
commit 970256e19d
8 changed files with 246 additions and 3 deletions

View File

@ -57,6 +57,10 @@ Currently, camelot works under Python 2.7.
The required dependencies include [numpy](http://www.numpy.org/), [OpenCV](http://opencv.org/) and [ImageMagick](http://www.imagemagick.org/script/index.php). The required dependencies include [numpy](http://www.numpy.org/), [OpenCV](http://opencv.org/) and [ImageMagick](http://www.imagemagick.org/script/index.php).
### Optional
You'll need to install [Tesseract](https://github.com/tesseract-ocr/tesseract) if you want to extract tables from image-based PDFs. You'll also need the matching Tesseract language pack if your PDF isn't in English.
## Installation ## Installation
Make sure you have the most updated versions for `pip` and `setuptools`. You can update them by Make sure you have the most updated versions for `pip` and `setuptools`. You can update them by

View File

@ -1,3 +1,3 @@
__version__ = '1.0.0' __version__ = '1.0.0'
__all__ = ['pdf', 'lattice', 'stream'] __all__ = ['pdf', 'lattice', 'stream', 'ocr']

View File

@ -79,6 +79,7 @@ class Cell:
self.text = '' self.text = ''
self.spanning_h = False self.spanning_h = False
self.spanning_v = False self.spanning_v = False
self.image = None
def add_text(self, text): def add_text(self, text):
"""Adds text to cell. """Adds text to cell.

148
camelot/ocr.py 100644
View File

@ -0,0 +1,148 @@
import os
import subprocess
import pyocr
from PIL import Image
from .table import Table
from .imgproc import (adaptive_threshold, find_lines, find_table_contours,
find_table_joints)
from .utils import merge_close_values, encode_list
class OCR:
    """Uses optical character recognition to get text out of image based pdfs.
    Currently works only on pdfs with lines.

    Parameters
    ----------
    table_area : list
        List of strings of the form x1,y1,x2,y2 where
        (x1, y1) -> left-top and (x2, y2) -> right-bottom in PDFMiner's
        coordinate space, denoting table areas to analyze.
        NOTE(review): the values are handed to the image-processing step
        as given, without any scaling to the rendered image -- confirm
        which coordinate space callers are expected to use.
        (optional, default: None)

    mtol : list
        List of ints specifying m-tolerance parameters. A single value
        is applied to every table found on a page.
        (optional, default: [2])

    dpi : int
        Dots per inch.
        (optional, default: 300)

    lang : string
        Language to be used for OCR.
        (optional, default: 'eng')

    scale : int
        Used to divide the height/width of a pdf to get a structuring
        element for image processing.
        (optional, default: 15)

    debug : string
        {'contour', 'line', 'joint', 'table'}
        Set to one of the above values to generate a matplotlib plot
        of detected contours, lines, joints and the table generated.
        (optional, default: None)
    """

    def __init__(self, table_area=None, mtol=None, dpi=300, lang="eng",
                 scale=15, debug=None):
        self.method = 'ocr'
        self.table_area = table_area
        # None stands in for the documented default so that no list object
        # is shared between instances through the signature.
        self.mtol = [2] if mtol is None else mtol
        # get_available_tools() returns an empty list when no OCR backend is
        # installed; keep None in that case so get_tables() can bail out
        # instead of raising IndexError here.
        tools = pyocr.get_available_tools()
        self.tool = tools[0] if tools else None
        self.dpi = dpi
        self.lang = lang
        self.scale = scale
        self.debug = debug

    def _ghostscript_command(self):
        """Returns the name of the ghostscript executable to call.

        'gs' is used when `gs -version` identifies itself as ghostscript;
        otherwise (including when `gs` cannot be run) 'gsc' is used.
        """
        try:
            version = subprocess.check_output(["gs", "-version"])
        except (OSError, subprocess.CalledProcessError):
            return "gsc"
        # check_output returns a byte string; compare against bytes so the
        # test works whether or not str and bytes are the same type.
        if b"ghostscript" in version.lower():
            return "gs"
        return "gsc"

    def _render_page(self, pdfname, imagename):
        """Renders pdfname to the png file imagename using ghostscript."""
        gs_call = [
            self._ghostscript_command(), "-q", "-sDEVICE=png16m", "-o",
            imagename, "-r{0}".format(self.dpi), pdfname
        ]
        # Close the devnull handle once ghostscript has finished.
        with open(os.devnull, 'w') as devnull:
            subprocess.call(gs_call, stdout=devnull, stderr=subprocess.STDOUT)

    def _parse_table_areas(self):
        """Converts the 'x1,y1,x2,y2' strings in self.table_area into
        (x1, y1, width, height) tuples of ints.
        """
        areas = []
        for area in self.table_area:
            x1, y1, x2, y2 = [int(v) for v in area.split(",")]
            areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1)))
        return areas

    def _expand_mtol(self, n_tables):
        """Returns one tolerance per table without modifying self.mtol.

        A single configured tolerance is repeated n_tables times; a longer
        list is returned as a copy and is expected to hold one entry per
        table.
        """
        if len(self.mtol) == 1:
            return list(self.mtol) * n_tables
        return list(self.mtol)

    def get_tables(self, pdfname):
        """Extracts tables from a single image based pdf page.

        Parameters
        ----------
        pdfname : string
            Path to the pdf file. A png with the same base name is written
            next to it.

        Returns
        -------
        page : dict
            {<pdf base name>: {'table-N': {'data': <encoded rows>}}}.
            None is returned when no OCR tool is available or when debug
            is set (the debug_* attributes are populated instead).
        """
        if self.tool is None:
            return None

        bname, __ = os.path.splitext(pdfname)
        imagename = ''.join([bname, '.png'])
        self._render_page(pdfname, imagename)

        img, threshold = adaptive_threshold(imagename)
        vmask, v_segments = find_lines(threshold, direction='vertical',
                                       scale=self.scale)
        hmask, h_segments = find_lines(threshold, direction='horizontal',
                                       scale=self.scale)

        if self.table_area is not None:
            areas = self._parse_table_areas()
            table_bbox = find_table_joints(areas, vmask, hmask)
        else:
            contours = find_table_contours(vmask, hmask)
            table_bbox = find_table_joints(contours, vmask, hmask)

        if self.debug:
            self.debug_images = (img, table_bbox)
            self.debug_segments = (v_segments, h_segments)
            self.debug_tables = []

        # Per-page copy: the same extractor may be asked for several pages
        # with different table counts, so self.mtol must stay untouched.
        mtols = self._expand_mtol(len(table_bbox))

        page = {}
        tables = {}
        ordered = sorted(table_bbox.keys(), key=lambda x: x[1])
        for table_no, k in enumerate(ordered):
            table_data = {}
            cols, rows = zip(*table_bbox[k])
            cols, rows = list(cols), list(rows)
            # Add the bounding box edges to the detected joints.
            cols.extend([k[0], k[2]])
            rows.extend([k[1], k[3]])
            cols = merge_close_values(sorted(cols), mtol=mtols[table_no])
            rows = merge_close_values(sorted(rows, reverse=True),
                                      mtol=mtols[table_no])
            cols = [(cols[i], cols[i + 1])
                    for i in range(0, len(cols) - 1)]
            rows = [(rows[i], rows[i + 1])
                    for i in range(0, len(rows) - 1)]

            table = Table(cols, rows)
            if self.debug:
                self.debug_tables.append(table)
            table.image = img[k[3]:k[1], k[0]:k[2]]

            # Crop every cell out of the page image and OCR it on its own.
            for row in table.cells:
                for cell in row:
                    x1 = int(cell.x1)
                    y1 = int(cell.y1)
                    x2 = int(cell.x2)
                    y2 = int(cell.y2)
                    cell.image = img[y1:y2, x1:x2]
                    text = self.tool.image_to_string(
                        Image.fromarray(cell.image),
                        lang=self.lang,
                        builder=pyocr.builders.TextBuilder()
                    )
                    cell.add_text(text)

            ar = table.get_list()
            ar.reverse()
            ar = encode_list(ar)
            table_data['data'] = ar
            tables['table-{0}'.format(table_no + 1)] = table_data

        page[os.path.basename(bname)] = tables

        if self.debug:
            return None
        return page

View File

@ -126,7 +126,7 @@ class Pdf:
if self.extractor.method == 'stream': if self.extractor.method == 'stream':
self.debug = self.extractor.debug self.debug = self.extractor.debug
self.debug_text = [] self.debug_text = []
elif self.extractor.method == 'lattice': elif self.extractor.method in ['lattice', 'ocr']:
self.debug = self.extractor.debug self.debug = self.extractor.debug
self.debug_images = [] self.debug_images = []
self.debug_segments = [] self.debug_segments = []
@ -138,7 +138,7 @@ class Pdf:
if self.extractor.debug: if self.extractor.debug:
if self.extractor.method == 'stream': if self.extractor.method == 'stream':
self.debug_text.append(self.extractor.debug_text) self.debug_text.append(self.extractor.debug_text)
elif self.extractor.method == 'lattice': elif self.extractor.method in ['lattice', 'ocr']:
self.debug_images.append(self.extractor.debug_images) self.debug_images.append(self.extractor.debug_images)
self.debug_segments.append(self.extractor.debug_segments) self.debug_segments.append(self.extractor.debug_segments)
self.debug_tables.append(self.extractor.debug_tables) self.debug_tables.append(self.extractor.debug_tables)

View File

@ -34,6 +34,7 @@ class Table:
self.cells = [[Cell(c[0], r[1], c[1], r[0]) self.cells = [[Cell(c[0], r[1], c[1], r[0])
for c in cols] for r in rows] for c in cols] for r in rows]
self.nocont_ = 0 self.nocont_ = 0
self.image = None
def set_all_edges(self): def set_all_edges(self):
"""Sets all table edges to True. """Sets all table edges to True.

View File

@ -3,5 +3,7 @@ matplotlib
nose nose
pdfminer pdfminer
pyexcel-xlsx pyexcel-xlsx
Pillow
pyocr
PyPDF2 PyPDF2
Sphinx Sphinx

View File

@ -17,6 +17,7 @@ from PyPDF2 import PdfFileReader
from camelot.pdf import Pdf from camelot.pdf import Pdf
from camelot.lattice import Lattice from camelot.lattice import Lattice
from camelot.stream import Stream from camelot.stream import Stream
from camelot.ocr import OCR
doc = """ doc = """
@ -52,6 +53,7 @@ options:
camelot methods: camelot methods:
lattice Looks for lines between data. lattice Looks for lines between data.
stream Looks for spaces between data. stream Looks for spaces between data.
ocr Looks for lines in image based pdfs.
See 'camelot <method> -h' for more information on a specific method. See 'camelot <method> -h' for more information on a specific method.
""" """
@ -101,6 +103,26 @@ options:
""" """
# docopt usage text for `camelot ocr`. --tarea and --mtol are declared
# repeatable ("...") so docopt returns them as lists; the option handling
# code iterates over both values and would otherwise walk a plain string
# character by character.
ocr_doc = """
OCR method looks for lines in image based pdfs.

usage:
 camelot ocr [-t <tarea>...] [-m <mtol>...] [options] [--] <file>

options:
 -t, --tarea <tarea>   Specific table areas to analyze.
 -m, --mtol <mtol>     Tolerance to account for when merging lines
                       which are very close. [default: 2]
 -D, --dpi <dpi>       Dots per inch, specify image quality to be used for OCR.
                       [default: 300]
 -l, --lang <lang>     Specify language to be used for OCR. [default: eng]
 -s, --scale <scale>   Scaling factor. Large scaling factor leads to
                       smaller lines being detected. [default: 15]
 -d, --debug <debug>   Debug by visualizing pdf geometry.
                       (contour,line,joint,table) Example: -d table
"""
def plot_table_barchart(r, c, p, pno, tno): def plot_table_barchart(r, c, p, pno, tno):
row_idx = [i + 1 for i, row in enumerate(r)] row_idx = [i + 1 for i, row in enumerate(r)]
col_idx = [i + 1 for i, col in enumerate(c)] col_idx = [i + 1 for i, col in enumerate(c)]
@ -315,6 +337,8 @@ if __name__ == '__main__':
args.update(docopt(lattice_doc, argv=argv)) args.update(docopt(lattice_doc, argv=argv))
elif args['<method>'] == 'stream': elif args['<method>'] == 'stream':
args.update(docopt(stream_doc, argv=argv)) args.update(docopt(stream_doc, argv=argv))
elif args['<method>'] == 'ocr':
args.update(docopt(ocr_doc, argv=argv))
vprint = print if args['--verbose'] else lambda *a, **k: None vprint = print if args['--verbose'] else lambda *a, **k: None
filename = args['<file>'] filename = args['<file>']
@ -487,6 +511,69 @@ if __name__ == '__main__':
except Exception as e: except Exception as e:
logging.exception(e.message, exc_info=True) logging.exception(e.message, exc_info=True)
sys.exit() sys.exit()
elif args['<method>'] == 'ocr':
try:
tarea = args['--tarea'] if args['--tarea'] else None
mtol = [int(m) for m in args['--mtol']]
manager = Pdf(OCR(table_area=tarea, mtol=mtol, dpi=int(args['--dpi']),
lang=args['--lang'], scale=int(args['--scale']),
debug=args['--debug']),
filename,
pagenos=p,
parallel=args['--parallel'],
clean=True)
data = manager.extract()
processing_time = time.time() - start_time
vprint("Finished processing in", processing_time, "seconds")
logging.info("Finished processing in " + str(processing_time) + " seconds")
if args['--plot']:
if args['--output']:
pngname = os.path.join(args['--output'], os.path.basename(pngname))
plot_type = args['--plot'].split(',')
if 'page' in plot_type:
for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
page = data[page_number]
for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
table = page[table_number]
plot_table_barchart(table['r_nempty_cells'],
table['c_nempty_cells'],
table['empty_p'],
page_number,
table_number)
if 'all' in plot_type:
plot_all_barchart(data, pngname)
if 'rc' in plot_type:
plot_rc_piechart(data, pngname)
if args['--print-stats']:
print_stats(data, processing_time)
if args['--save-stats']:
if args['--output']:
scorename = os.path.join(args['--output'], os.path.basename(scorename))
with open(scorename, 'w') as score_file:
score_file.write('table,nrows,ncols,empty_p,line_p,text_p,score\n')
for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
page = data[page_number]
for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
table = page[table_number]
score_file.write('{0},{1},{2},{3},{4},{5},{6}\n'.format(
''.join([page_number, '_', table_number]),
table['nrows'],
table['ncols'],
table['empty_p'],
table['line_p'],
table['text_p'],
table['score']))
if args['--debug']:
manager.debug_plot()
except Exception as e:
logging.exception(e.message, exc_info=True)
sys.exit()
if args['--debug']: if args['--debug']:
print("See 'camelot <method> -h' for various parameters you can tweak.") print("See 'camelot <method> -h' for various parameters you can tweak.")