From 970256e19d8a05463888321eb99a1eb4908f8fed Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Sat, 7 Jan 2017 16:37:56 +0530 Subject: [PATCH] Add OCR support for image based pdfs with lines * Cosmetics * Remove unnecessary kwargs * Direct ghostscript call output to /dev/null * Change char_margin's default value * Add image attribute in Table and Cell * Add OCR * Fix coordinates * Add table_area * Add ocr options to cli * Direct ghostscript call output to /dev/null * Add ocr docstring * Add requirements * Update README --- README.md | 4 ++ camelot/__init__.py | 2 +- camelot/cell.py | 1 + camelot/ocr.py | 148 ++++++++++++++++++++++++++++++++++++++++++++ camelot/pdf.py | 4 +- camelot/table.py | 1 + requirements.txt | 2 + tools/camelot | 87 ++++++++++++++++++++++++++ 8 files changed, 246 insertions(+), 3 deletions(-) create mode 100644 camelot/ocr.py diff --git a/README.md b/README.md index b1125d8..4c9e97e 100644 --- a/README.md +++ b/README.md @@ -57,6 +57,10 @@ Currently, camelot works under Python 2.7. The required dependencies include [numpy](http://www.numpy.org/), [OpenCV](http://opencv.org/) and [ImageMagick](http://www.imagemagick.org/script/index.php). +### Optional + +You'll need to install [Tesseract](https://github.com/tesseract-ocr/tesseract) if you want to extract tables from image-based PDFs. You'll also need a Tesseract language pack if your PDF isn't in English. + ## Installation Make sure you have the most updated versions for `pip` and `setuptools`. 
You can update them by
diff --git a/camelot/__init__.py b/camelot/__init__.py index a3f6f78..ed19a28 100644 --- a/camelot/__init__.py +++ b/camelot/__init__.py @@ -1,3 +1,3 @@ __version__ = '1.0.0' -__all__ = ['pdf', 'lattice', 'stream'] +__all__ = ['pdf', 'lattice', 'stream', 'ocr']
diff --git a/camelot/cell.py b/camelot/cell.py index 159f571..8dfe8d3 100644 --- a/camelot/cell.py +++ b/camelot/cell.py @@ -79,6 +79,7 @@ class Cell: self.text = '' self.spanning_h = False self.spanning_v = False + self.image = None def add_text(self, text): """Adds text to cell.
diff --git a/camelot/ocr.py b/camelot/ocr.py
new file mode 100644
index 0000000..57ddb54
--- /dev/null
+++ b/camelot/ocr.py
@@ -0,0 +1,168 @@
+import os
+import subprocess
+
+import pyocr
+from PIL import Image
+
+from .table import Table
+from .imgproc import (adaptive_threshold, find_lines, find_table_contours,
+                      find_table_joints)
+from .utils import merge_close_values, encode_list
+
+
+class OCR:
+    """Uses optical character recognition to get text out of image based pdfs.
+    Currently works only on pdfs with lines.
+
+    Parameters
+    ----------
+    table_area : list
+        List of strings of the form x1,y1,x2,y2 where
+        (x1, y1) -> left-top and (x2, y2) -> right-bottom in PDFMiner's
+        coordinate space, denoting table areas to analyze.
+        (optional, default: None)
+
+    mtol : list
+        List of ints specifying m-tolerance parameters, one per table.
+        A single value is applied to every table on a page.
+        (optional, default: [2])
+
+    dpi : int
+        Dots per inch.
+        (optional, default: 300)
+
+    lang : string
+        Language to be used for OCR.
+        (optional, default: 'eng')
+
+    scale : int
+        Used to divide the height/width of a pdf to get a structuring
+        element for image processing.
+        (optional, default: 15)
+
+    debug : string
+        {'contour', 'line', 'joint', 'table'}
+        Set to one of the above values to generate a matplotlib plot
+        of detected contours, lines, joints and the table generated.
+        (optional, default: None)
+    """
+    def __init__(self, table_area=None, mtol=None, dpi=300, lang="eng",
+                 scale=15, debug=None):
+
+        self.method = 'ocr'
+        self.table_area = table_area
+        # Build the default per instance; a list literal in the signature
+        # would be shared by every OCR object.
+        self.mtol = [2] if mtol is None else list(mtol)
+        # get_available_tools() returns an empty list when no OCR backend is
+        # installed; keep None so get_tables() can bail out cleanly.
+        tools = pyocr.get_available_tools()
+        self.tool = tools[0] if tools else None
+        self.dpi = dpi
+        self.lang = lang
+        self.scale = scale
+        self.debug = debug
+
+    def get_tables(self, pdfname):
+        """Renders a pdf to a png with ghostscript, finds tables from the
+        detected lines and runs OCR on every cell.
+
+        Parameters
+        ----------
+        pdfname : string
+            Path to the pdf file.
+
+        Returns
+        -------
+        page : dict
+            {pdf basename without extension: {'table-<n>': {'data': rows}}}.
+            None when no OCR tool is available or when debug is set.
+        """
+        if self.tool is None:
+            return None
+        bname, __ = os.path.splitext(pdfname)
+        imagename = ''.join([bname, '.png'])
+
+        gs_call = [
+            "-q", "-sDEVICE=png16m", "-o", imagename, "-r{0}".format(self.dpi),
+            pdfname
+        ]
+        if "ghostscript" in subprocess.check_output(["gs", "-version"]).lower():
+            gs_call.insert(0, "gs")
+        else:
+            gs_call.insert(0, "gsc")
+        # Close the null device handle once ghostscript has finished.
+        with open(os.devnull, 'w') as devnull:
+            subprocess.call(gs_call, stdout=devnull, stderr=subprocess.STDOUT)
+
+        img, threshold = adaptive_threshold(imagename)
+        vmask, v_segments = find_lines(threshold, direction='vertical',
+                                       scale=self.scale)
+        hmask, h_segments = find_lines(threshold, direction='horizontal',
+                                       scale=self.scale)
+
+        if self.table_area is not None:
+            # NOTE(review): the values are handed to find_table_joints as
+            # (x, y, width, height) without any scaling; confirm they are
+            # expected in image space rather than PDFMiner's space.
+            areas = []
+            for area in self.table_area:
+                x1, y1, x2, y2 = [int(v) for v in area.split(",")]
+                areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1)))
+            table_bbox = find_table_joints(areas, vmask, hmask)
+        else:
+            contours = find_table_contours(vmask, hmask)
+            table_bbox = find_table_joints(contours, vmask, hmask)
+
+        if self.debug:
+            self.debug_images = (img, table_bbox)
+            self.debug_segments = (v_segments, h_segments)
+            self.debug_tables = []
+
+        # Spread a single tolerance over all tables of this page. Use a local
+        # so that self.mtol keeps its configured value for the next call.
+        mtol = self.mtol
+        if len(mtol) == 1:
+            mtol = mtol * len(table_bbox)
+
+        page = {}
+        tables = {}
+        bboxes = sorted(table_bbox.keys(), key=lambda x: x[1])
+        for table_no, k in enumerate(bboxes):
+            # Works for a table without joints too, unlike zip(*joints).
+            cols = [joint[0] for joint in table_bbox[k]]
+            rows = [joint[1] for joint in table_bbox[k]]
+            cols.extend([k[0], k[2]])
+            rows.extend([k[1], k[3]])
+            cols = merge_close_values(sorted(cols), mtol=mtol[table_no])
+            rows = merge_close_values(sorted(rows, reverse=True),
+                                      mtol=mtol[table_no])
+            cols = [(cols[i], cols[i + 1])
+                    for i in range(0, len(cols) - 1)]
+            rows = [(rows[i], rows[i + 1])
+                    for i in range(0, len(rows) - 1)]
+            table = Table(cols, rows)
+            if self.debug:
+                self.debug_tables.append(table)
+            table.image = img[k[3]:k[1], k[0]:k[2]]
+            for row in table.cells:
+                for cell in row:
+                    x1, y1 = int(cell.x1), int(cell.y1)
+                    x2, y2 = int(cell.x2), int(cell.y2)
+                    cell.image = img[y1:y2, x1:x2]
+                    text = self.tool.image_to_string(
+                        Image.fromarray(cell.image),
+                        lang=self.lang,
+                        builder=pyocr.builders.TextBuilder()
+                    )
+                    cell.add_text(text)
+            ar = table.get_list()
+            ar.reverse()
+            ar = encode_list(ar)
+            tables['table-{0}'.format(table_no + 1)] = {'data': ar}
+        page[os.path.basename(bname)] = tables
+
+        if self.debug:
+            return None
+
+        return page
diff --git a/camelot/pdf.py b/camelot/pdf.py index 11cc6b4..85c2dbc 100644 --- a/camelot/pdf.py +++ b/camelot/pdf.py @@ -126,7 +126,7 @@ class Pdf: if self.extractor.method == 'stream': self.debug = self.extractor.debug self.debug_text = [] -elif self.extractor.method == 'lattice': +elif self.extractor.method in ['lattice', 'ocr']: self.debug = self.extractor.debug self.debug_images = [] self.debug_segments = [] @@ -138,7 +138,7 @@ class Pdf: if self.extractor.debug: if self.extractor.method == 'stream': self.debug_text.append(self.extractor.debug_text) -elif self.extractor.method == 'lattice': +elif self.extractor.method in ['lattice', 'ocr']: self.debug_images.append(self.extractor.debug_images) self.debug_segments.append(self.extractor.debug_segments) self.debug_tables.append(self.extractor.debug_tables)
diff --git a/camelot/table.py b/camelot/table.py index 0978e7e..8549300 100644 --- a/camelot/table.py +++ b/camelot/table.py @@ -34,6 +34,7
@@ class Table: self.cells = [[Cell(c[0], r[1], c[1], r[0]) for c in cols] for r in rows] self.nocont_ = 0 + self.image = None def set_all_edges(self): """Sets all table edges to True. diff --git a/requirements.txt b/requirements.txt index 6dda062..826e271 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,5 +3,7 @@ matplotlib nose pdfminer pyexcel-xlsx +Pillow +pyocr PyPDF2 Sphinx diff --git a/tools/camelot b/tools/camelot index 02fa233..837072f 100755 --- a/tools/camelot +++ b/tools/camelot @@ -17,6 +17,7 @@ from PyPDF2 import PdfFileReader from camelot.pdf import Pdf from camelot.lattice import Lattice from camelot.stream import Stream +from camelot.ocr import OCR doc = """ @@ -52,6 +53,7 @@ options: camelot methods: lattice Looks for lines between data. stream Looks for spaces between data. + ocr Looks for lines in image based pdfs. See 'camelot -h' for more information on a specific method. """ @@ -101,6 +103,26 @@ options: """ +ocr_doc = """ +OCR method looks for lines in image based pdfs. + +usage: + camelot ocr [-t ] [-m ] [options] [--] + +options: + -t, --tarea Specific table areas to analyze. + -m, --mtol Tolerance to account for when merging lines + which are very close. [default: 2] + -D, --dpi Dots per inch, specify image quality to be used for OCR. + [default: 300] + -l, --lang Specify language to be used for OCR. [default: eng] + -s, --scale Scaling factor. Large scaling factor leads to + smaller lines being detected. [default: 15] + -d, --debug Debug by visualizing pdf geometry. 
+ (contour,line,joint,table) Example: -d table +""" + + def plot_table_barchart(r, c, p, pno, tno): row_idx = [i + 1 for i, row in enumerate(r)] col_idx = [i + 1 for i, col in enumerate(c)] @@ -315,6 +337,8 @@ if __name__ == '__main__': args.update(docopt(lattice_doc, argv=argv)) elif args[''] == 'stream': args.update(docopt(stream_doc, argv=argv)) + elif args[''] == 'ocr': + args.update(docopt(ocr_doc, argv=argv)) vprint = print if args['--verbose'] else lambda *a, **k: None filename = args[''] @@ -487,6 +511,69 @@ if __name__ == '__main__': except Exception as e: logging.exception(e.message, exc_info=True) sys.exit() + elif args[''] == 'ocr': + try: + tarea = args['--tarea'] if args['--tarea'] else None + mtol = [int(m) for m in args['--mtol']] + manager = Pdf(OCR(table_area=tarea, mtol=mtol, dpi=int(args['--dpi']), + lang=args['--lang'], scale=int(args['--scale']), + debug=args['--debug']), + filename, + pagenos=p, + parallel=args['--parallel'], + clean=True) + data = manager.extract() + + processing_time = time.time() - start_time + vprint("Finished processing in", processing_time, "seconds") + logging.info("Finished processing in " + str(processing_time) + " seconds") + + if args['--plot']: + if args['--output']: + pngname = os.path.join(args['--output'], os.path.basename(pngname)) + plot_type = args['--plot'].split(',') + if 'page' in plot_type: + for page_number in sorted(data.keys(), key=lambda x: int(x[5:])): + page = data[page_number] + for table_number in sorted(page.keys(), key=lambda x: int(x[6:])): + table = page[table_number] + plot_table_barchart(table['r_nempty_cells'], + table['c_nempty_cells'], + table['empty_p'], + page_number, + table_number) + + if 'all' in plot_type: + plot_all_barchart(data, pngname) + + if 'rc' in plot_type: + plot_rc_piechart(data, pngname) + + if args['--print-stats']: + print_stats(data, processing_time) + + if args['--save-stats']: + if args['--output']: + scorename = os.path.join(args['--output'], 
os.path.basename(scorename)) + with open(scorename, 'w') as score_file: + score_file.write('table,nrows,ncols,empty_p,line_p,text_p,score\n') + for page_number in sorted(data.keys(), key=lambda x: int(x[5:])): + page = data[page_number] + for table_number in sorted(page.keys(), key=lambda x: int(x[6:])): + table = page[table_number] + score_file.write('{0},{1},{2},{3},{4},{5},{6}\n'.format( + ''.join([page_number, '_', table_number]), + table['nrows'], + table['ncols'], + table['empty_p'], + table['line_p'], + table['text_p'], + table['score'])) + if args['--debug']: + manager.debug_plot() + except Exception as e: + logging.exception(e.message, exc_info=True) + sys.exit() if args['--debug']: print("See 'camelot -h' for various parameters you can tweak.")