From 970256e19d8a05463888321eb99a1eb4908f8fed Mon Sep 17 00:00:00 2001
From: Vinayak Mehta <vmehta94@gmail.com>
Date: Sat, 7 Jan 2017 16:37:56 +0530
Subject: [PATCH] Add OCR support for image based pdfs with lines

* Cosmetics

* Remove unnecessary kwargs

* Direct ghostscript call output to /dev/null

* Change char_margin's default value

* Add image attribute in Table and Cell

* Add OCR

* Fix coordinates

* Add table_area

* Add ocr options to cli

* Direct ghostscript call output to /dev/null

* Add ocr docstring

* Add requirements

* Update README
---
 README.md           |   4 ++
 camelot/__init__.py |   2 +-
 camelot/cell.py     |   1 +
 camelot/ocr.py      | 148 ++++++++++++++++++++++++++++++++++++++++++++
 camelot/pdf.py      |   4 +-
 camelot/table.py    |   1 +
 requirements.txt    |   2 +
 tools/camelot       |  87 ++++++++++++++++++++++++++
 8 files changed, 246 insertions(+), 3 deletions(-)
 create mode 100644 camelot/ocr.py

diff --git a/README.md b/README.md
index b1125d8..4c9e97e 100644
--- a/README.md
+++ b/README.md
@@ -57,6 +57,10 @@ Currently, camelot works under Python 2.7.
 
 The required dependencies include [numpy](http://www.numpy.org/), [OpenCV](http://opencv.org/) and [ImageMagick](http://www.imagemagick.org/script/index.php).
 
+### Optional
+
+You'll need to install [Tesseract](https://github.com/tesseract-ocr/tesseract) if you want to extract tables from image-based PDFs. You'll also need a Tesseract language pack if your PDF isn't in English.
+
 ## Installation
 
 Make sure you have the most updated versions for `pip` and `setuptools`. You can update them by
diff --git a/camelot/__init__.py b/camelot/__init__.py
index a3f6f78..ed19a28 100644
--- a/camelot/__init__.py
+++ b/camelot/__init__.py
@@ -1,3 +1,3 @@
 __version__ = '1.0.0'
 
-__all__ = ['pdf', 'lattice', 'stream']
+__all__ = ['pdf', 'lattice', 'stream', 'ocr']
diff --git a/camelot/cell.py b/camelot/cell.py
index 159f571..8dfe8d3 100644
--- a/camelot/cell.py
+++ b/camelot/cell.py
@@ -79,6 +79,7 @@ class Cell:
         self.text = ''
         self.spanning_h = False
         self.spanning_v = False
+        self.image = None
 
     def add_text(self, text):
         """Adds text to cell.
diff --git a/camelot/ocr.py b/camelot/ocr.py
new file mode 100644
index 0000000..57ddb54
--- /dev/null
+++ b/camelot/ocr.py
@@ -0,0 +1,148 @@
+import os
+import subprocess
+
+import pyocr
+from PIL import Image
+
+from .table import Table
+from .imgproc import (adaptive_threshold, find_lines, find_table_contours,
+                      find_table_joints)
+from .utils import merge_close_values, encode_list
+
+
+class OCR:
+    """Uses optical character recognition to get text out of image based pdfs.
+    Currently works only on pdfs with lines.
+
+    Parameters
+    ----------
+    table_area : list
+        List of strings of the form x1,y1,x2,y2 where
+        (x1, y1) -> left-top and (x2, y2) -> right-bottom in PDFMiner's
+        coordinate space, denoting table areas to analyze.
+        (optional, default: None)
+
+    mtol : list
+        List of ints specifying m-tolerance parameters, one per table.
+        A single value is applied to every table. (optional, default: [2])
+
+    dpi : int
+        Dots per inch.
+        (optional, default: 300)
+
+    lang : string
+        Language to be used for OCR.
+        (optional, default: 'eng')
+
+    scale : int
+        Used to divide the height/width of a pdf to get a structuring
+        element for image processing.
+        (optional, default: 15)
+
+    debug : string
+        {'contour', 'line', 'joint', 'table'}
+        Set to one of the above values to generate a matplotlib plot
+        of detected contours, lines, joints and the table generated.
+        (optional, default: None)
+    """
+    def __init__(self, table_area=None, mtol=[2], dpi=300, lang="eng", scale=15,
+                 debug=None):
+        self.method = 'ocr'
+        self.table_area = table_area
+        self.mtol = list(mtol)  # private copy, never alias the default list
+        # get_available_tools() is empty when no OCR tool is installed; keep
+        # None in that case so that get_tables can bail out instead of crashing
+        tools = pyocr.get_available_tools()
+        self.tool = tools[0] if tools else None
+        self.dpi = dpi
+        self.lang = lang
+        self.scale = scale
+        self.debug = debug
+
+    def get_tables(self, pdfname):
+        """Renders a pdf to a png using ghostscript, detects tables in it
+        using its lines and runs OCR on every cell.
+
+        Parameters
+        ----------
+        pdfname : string
+            Path to pdf file.
+
+        Returns
+        -------
+        page : dict
+            {<pdf basename>: {'table-<n>': {'data': <list of rows>}}}
+            None if no OCR tool is available or if debug is set.
+        """
+        if self.tool is None:
+            return None
+        bname, __ = os.path.splitext(pdfname)
+        imagename = ''.join([bname, '.png'])
+
+        gs_version = subprocess.check_output(["gs", "-version"]).lower()
+        gs_call = ["gs" if "ghostscript" in gs_version else "gsc", "-q",
+                   "-sDEVICE=png16m", "-o", imagename,
+                   "-r{0}".format(self.dpi), pdfname]
+        # close the devnull handle once ghostscript is done
+        with open(os.devnull, 'w') as devnull:
+            subprocess.call(gs_call, stdout=devnull, stderr=subprocess.STDOUT)
+
+        img, threshold = adaptive_threshold(imagename)
+        vmask, v_segments = find_lines(threshold, direction='vertical',
+            scale=self.scale)
+        hmask, h_segments = find_lines(threshold, direction='horizontal',
+            scale=self.scale)
+
+        if self.table_area is not None:
+            # turn each "x1,y1,x2,y2" string into an (x, y, width, height) tuple
+            areas = []
+            for area in self.table_area:
+                x1, y1, x2, y2 = [int(v) for v in area.split(",")]
+                areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1)))
+        else:
+            areas = find_table_contours(vmask, hmask)
+        table_bbox = find_table_joints(areas, vmask, hmask)
+
+        if self.debug:
+            self.debug_images = (img, table_bbox)
+            self.debug_segments = (v_segments, h_segments)
+            self.debug_tables = []
+
+        # use one tolerance for every table when only one is given; self.mtol
+        # is left untouched since it is reused for pages with more tables
+        mtol = self.mtol * len(table_bbox) if len(self.mtol) == 1 else self.mtol
+
+        tables = {}
+        bboxes = sorted(table_bbox.keys(), key=lambda x: x[1])
+        for table_no, k in enumerate(bboxes):
+            cols, rows = zip(*table_bbox[k])
+            cols, rows = list(cols), list(rows)
+            cols.extend([k[0], k[2]])
+            rows.extend([k[1], k[3]])
+            cols = merge_close_values(sorted(cols), mtol=mtol[table_no])
+            rows = merge_close_values(sorted(rows, reverse=True),
+                                      mtol=mtol[table_no])
+            cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
+            rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)]
+            table = Table(cols, rows)
+            if self.debug:
+                self.debug_tables.append(table)
+            table.image = img[k[3]:k[1], k[0]:k[2]]
+            for row in table.cells:
+                for cell in row:
+                    x1, y1 = int(cell.x1), int(cell.y1)
+                    x2, y2 = int(cell.x2), int(cell.y2)
+                    cell.image = img[y1:y2, x1:x2]
+                    text = self.tool.image_to_string(
+                        Image.fromarray(cell.image), lang=self.lang,
+                        builder=pyocr.builders.TextBuilder())
+                    cell.add_text(text)
+            ar = table.get_list()
+            ar.reverse()
+            tables['table-{0}'.format(table_no + 1)] = {'data': encode_list(ar)}
+        page = {os.path.basename(bname): tables}
+
+        if self.debug:
+            return None
+
+        return page
\ No newline at end of file
diff --git a/camelot/pdf.py b/camelot/pdf.py
index 11cc6b4..85c2dbc 100644
--- a/camelot/pdf.py
+++ b/camelot/pdf.py
@@ -126,7 +126,7 @@ class Pdf:
                 if self.extractor.method == 'stream':
                     self.debug = self.extractor.debug
                     self.debug_text = []
-                elif self.extractor.method == 'lattice':
+                elif self.extractor.method in ['lattice', 'ocr']:
                     self.debug = self.extractor.debug
                     self.debug_images = []
                     self.debug_segments = []
@@ -138,7 +138,7 @@ class Pdf:
                 if self.extractor.debug:
                     if self.extractor.method == 'stream':
                         self.debug_text.append(self.extractor.debug_text)
-                    elif self.extractor.method == 'lattice':
+                    elif self.extractor.method in ['lattice', 'ocr']:
                         self.debug_images.append(self.extractor.debug_images)
                         self.debug_segments.append(self.extractor.debug_segments)
                         self.debug_tables.append(self.extractor.debug_tables)
diff --git a/camelot/table.py b/camelot/table.py
index 0978e7e..8549300 100644
--- a/camelot/table.py
+++ b/camelot/table.py
@@ -34,6 +34,7 @@ class Table:
         self.cells = [[Cell(c[0], r[1], c[1], r[0])
                        for c in cols] for r in rows]
         self.nocont_ = 0
+        self.image = None
 
     def set_all_edges(self):
         """Sets all table edges to True.
diff --git a/requirements.txt b/requirements.txt
index 6dda062..826e271 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,5 +3,7 @@ matplotlib
 nose
 pdfminer
 pyexcel-xlsx
+Pillow
+pyocr
 PyPDF2
 Sphinx
diff --git a/tools/camelot b/tools/camelot
index 02fa233..837072f 100755
--- a/tools/camelot
+++ b/tools/camelot
@@ -17,6 +17,7 @@ from PyPDF2 import PdfFileReader
 from camelot.pdf import Pdf
 from camelot.lattice import Lattice
 from camelot.stream import Stream
+from camelot.ocr import OCR
 
 
 doc = """
@@ -52,6 +53,7 @@ options:
 camelot methods:
  lattice  Looks for lines between data.
  stream   Looks for spaces between data.
+ ocr      Looks for lines in image based pdfs.
 
 See 'camelot <method> -h' for more information on a specific method.
 """
@@ -101,6 +103,26 @@ options:
 """
 
 
+ocr_doc = """
+OCR method looks for lines in image based pdfs.
+
+usage:
+ camelot ocr [-t <tarea>...] [-m <mtol>...] [options] [--] <file>
+
+options:
+ -t, --tarea <tarea>  Specific table areas to analyze.
+ -m, --mtol <mtol>    Tolerance to account for when merging lines
+                      which are very close. [default: 2]
+ -D, --dpi <dpi>      Dots per inch, specify image quality to be used for OCR.
+                      [default: 300]
+ -l, --lang <lang>    Specify language to be used for OCR. [default: eng]
+ -s, --scale <scale>  Scaling factor. Large scaling factor leads to
+                      smaller lines being detected. [default: 15]
+ -d, --debug <debug>  Debug by visualizing pdf geometry.
+                      (contour,line,joint,table) Example: -d table
+"""
+
+
 def plot_table_barchart(r, c, p, pno, tno):
     row_idx = [i + 1 for i, row in enumerate(r)]
     col_idx = [i + 1 for i, col in enumerate(c)]
@@ -315,6 +337,8 @@ if __name__ == '__main__':
         args.update(docopt(lattice_doc, argv=argv))
     elif args['<method>'] == 'stream':
         args.update(docopt(stream_doc, argv=argv))
+    elif args['<method>'] == 'ocr':
+        args.update(docopt(ocr_doc, argv=argv))
 
     vprint = print if args['--verbose'] else lambda *a, **k: None
     filename = args['<file>']
@@ -487,6 +511,69 @@ if __name__ == '__main__':
         except Exception as e:
             logging.exception(e.message, exc_info=True)
             sys.exit()
+    elif args['<method>'] == 'ocr':
+        try:
+            tarea = args['--tarea'] if args['--tarea'] else None
+            mtol = [int(m) for m in args['--mtol']]
+            manager = Pdf(OCR(table_area=tarea, mtol=mtol, dpi=int(args['--dpi']),
+                                  lang=args['--lang'], scale=int(args['--scale']),
+                                  debug=args['--debug']),
+                          filename,
+                          pagenos=p,
+                          parallel=args['--parallel'],
+                          clean=True)
+            data = manager.extract()
+
+            processing_time = time.time() - start_time
+            vprint("Finished processing in", processing_time, "seconds")
+            logging.info("Finished processing in " + str(processing_time) + " seconds")
+
+            if args['--plot']:
+                if args['--output']:
+                    pngname = os.path.join(args['--output'], os.path.basename(pngname))
+                plot_type = args['--plot'].split(',')
+                if 'page' in plot_type:
+                    for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
+                        page = data[page_number]
+                        for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
+                            table = page[table_number]
+                            plot_table_barchart(table['r_nempty_cells'],
+                                table['c_nempty_cells'],
+                                table['empty_p'],
+                                page_number,
+                                table_number)
+
+                if 'all' in plot_type:
+                    plot_all_barchart(data, pngname)
+
+                if 'rc' in plot_type:
+                    plot_rc_piechart(data, pngname)
+
+            if args['--print-stats']:
+                print_stats(data, processing_time)
+
+            if args['--save-stats']:
+                if args['--output']:
+                    scorename = os.path.join(args['--output'], os.path.basename(scorename))
+                with open(scorename, 'w') as score_file:
+                    score_file.write('table,nrows,ncols,empty_p,line_p,text_p,score\n')
+                    for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
+                        page = data[page_number]
+                        for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
+                            table = page[table_number]
+                            score_file.write('{0},{1},{2},{3},{4},{5},{6}\n'.format(
+                                ''.join([page_number, '_', table_number]),
+                                table['nrows'],
+                                table['ncols'],
+                                table['empty_p'],
+                                table['line_p'],
+                                table['text_p'],
+                                table['score']))
+            if args['--debug']:
+                manager.debug_plot()
+        except Exception as e:
+            logging.exception(e.message, exc_info=True)
+            sys.exit()
 
     if args['--debug']:
         print("See 'camelot <method> -h' for various parameters you can tweak.")