From 0c329634e738be01c937d724f47826fbe8d5843a Mon Sep 17 00:00:00 2001
From: Vinayak Mehta <vmehta94@gmail.com>
Date: Fri, 7 Sep 2018 05:13:34 +0530
Subject: [PATCH] Add export to TableList and Table

---
 README.md                  |   4 +-
 camelot/core.py            | 373 +++++++++++++++----------------------
 camelot/handlers.py        |  10 +-
 camelot/parsers/base.py    |   2 +-
 camelot/parsers/lattice.py |   8 +-
 camelot/parsers/stream.py  |  18 +-
 6 files changed, 169 insertions(+), 246 deletions(-)
diff --git a/README.md b/README.md
index 4a5227f..b5a3e8b 100644
--- a/README.md
+++ b/README.md
@@ -9,9 +9,10 @@ Camelot is a Python 2.7 library and command-line tool for getting tables out of
 >>> tables = camelot.read_pdf("foo.pdf")
 >>> tables
 &lt;TableList n=2&gt;
->>> tables.to_csv(zip=True) # to_json, to_excel, to_html
+>>> tables.export("foo.csv", f="csv", compress=True) # json, excel, html
 >>> tables[0]
 &lt;Table shape=(3,4)&gt;
+>>> tables[0].to_csv("foo.csv") # to_json, to_excel, to_html
 >>> tables[0].parsing_report
 {
     "accuracy": 96,
@@ -20,7 +21,6 @@ Camelot is a Python 2.7 library and command-line tool for getting tables out of
     "page": 1
 }
 >>> df = tables[0].df
->>> tables[0].to_csv("foo.csv") # to_json, to_excel, to_html
 </pre>
 
 Camelot comes with a CLI where you can specify page numbers, output format, output directory etc. By default, the output files are placed in the same directory as the PDF.
diff --git a/camelot/core.py b/camelot/core.py
index 9f98f16..2003979 100644
--- a/camelot/core.py
+++ b/camelot/core.py
@@ -1,6 +1,10 @@
+import os
 import json
+import zipfile
+import tempfile
 
 import numpy as np
+import pandas as pd
 
 
 class Cell(object):
@@ -68,16 +72,46 @@ class Table(object):
         self.rows = rows
         self.cells = [[Cell(c[0], r[1], c[1], r[0])
                        for c in cols] for r in rows]
-        self._df = None
-        self._shape = (0, 0)
-        self._accuracy = 0
-        self._whitespace = 0
-        self._order = None
-        self._page = None
+        self.df = None
+        self.shape = (0, 0)
+        self.accuracy = 0
+        self.whitespace = 0
+        self.order = None
+        self.page = None
 
     def __repr__(self):
-        return '<{} shape={}>'.format(self.__class__.__name__, self._shape)
+        return '<{} shape={}>'.format(self.__class__.__name__, self.shape)
 
+    @property
+    def data(self):
+        """Stripped text of every cell, grouped row by row.
+
+        Returns
+        -------
+        list of list
+            One inner list per row of ``self.cells``, holding each cell's
+            ``text.strip()`` in order.
+
+        """
+        return [[cell.text.strip() for cell in row] for row in self.cells]
+
+    @property
+    def parsing_report(self):
+        """Summarise this table's parsing metadata.
+
+        Returns
+        -------
+        dict
+            ``accuracy``, ``whitespace``, ``order`` and ``page`` keys mapped
+            to the table attributes of the same name.
+        """
+        # pretty?
+        return dict(
+            accuracy=self.accuracy,
+            whitespace=self.whitespace,
+            order=self.order,
+            page=self.page)
+
     def set_border(self):
         """
 
@@ -253,119 +287,38 @@ class Table(object):
                         cell.hspan = True
         return self
 
-    @property
-    def data(self):
-        """
-
-        Returns
-        -------
-
-        """
-        d = []
-        for row in self.cells:
-            d.append([cell.text.strip() for cell in row])
-        return d
-
-    @property
-    def df(self):
-        """
-
-        Returns
-        -------
-
-        """
-        return self._df
-
-    @df.setter
-    def df(self, dataframe):
-        self._df = dataframe
-
-    @property
-    def shape(self):
-        """
-
-        Returns
-        -------
-
-        """
-        return self._shape
-
-    @shape.setter
-    def shape(self, s):
-        self._shape = s
-
-    @property
-    def accuracy(self):
-        """
-
-        Returns
-        -------
-
-        """
-        return self._accuracy
-
-    @accuracy.setter
-    def accuracy(self, a):
-        self._accuracy = a
-
-    @property
-    def whitespace(self):
-        """
-
-        Returns
-        -------
-
-        """
-        return self._whitespace
-
-    @whitespace.setter
-    def whitespace(self, w):
-        self._whitespace = w
-
-    @property
-    def order(self):
-        """
-
-        Returns
-        -------
-
-        """
-        return self._order
-
-    @order.setter
-    def order(self, o):
-        self._order = o
-
-    @property
-    def page(self):
-        """
-
-        Returns
-        -------
-
-        """
-        return self._page
-
-    @page.setter
-    def page(self, p):
-        self._page = p
-
-    @property
-    def parsing_report(self):
-        """
-
-        Returns
-        -------
-
-        """
-        # pretty?
-        report = {
-            'accuracy': self._accuracy,
-            'whitespace': self._whitespace,
-            'order': self._order,
-            'page': self._page
+    def to_csv(self, path, **kwargs):
+        """Write ``self.df`` to ``path`` as CSV; ``kwargs`` override defaults."""
+        options = {
+            'encoding': 'utf-8', 'index': False,
+            'quoting': 1  # csv.QUOTE_ALL
         }
-        return report
+        options.update(kwargs)
+        self.df.to_csv(path, **options)
+
+    def to_json(self, path, **kwargs):
+        """Write ``self.df`` to ``path`` as JSON (``orient='records'`` unless
+        overridden through ``kwargs``)."""
+        options = {'orient': 'records'}
+        options.update(kwargs)
+        payload = self.df.to_json(**options)
+        with open(path, 'w') as out:
+            out.write(payload)
+
+    def to_excel(self, path, **kwargs):
+        """Write ``self.df`` to the workbook at ``path``; the sheet is named
+        ``page-<page>-table-<order>`` unless ``kwargs`` says otherwise."""
+        sheet = 'page-{}-table-{}'.format(self.page, self.order)
+        options = dict(sheet_name=sheet, encoding='utf-8')
+        options.update(kwargs)
+        workbook = pd.ExcelWriter(path)
+        self.df.to_excel(workbook, **options)
+        workbook.save()
+
+    def to_html(self, path, **kwargs):
+        markup = self.df.to_html(**kwargs)  # render first, then write to path
+        with open(path, 'w') as out:
+            out.write(markup)
 
 
 class TableList(object):
@@ -385,72 +338,82 @@ class TableList(object):
     def __getitem__(self, idx):
         return self._tables[idx]
 
+    @staticmethod
+    def _format_func(table, f):  # e.g. f='csv' looks up table.to_csv
+        return getattr(table, 'to_{}'.format(f))
+
+    def _write_file(self, f=None, **kwargs):
+        """Write each table to ``<dirname>/<root>-page-P-table-T<ext>``
+        using the table's own ``to_<f>`` method (e.g. ``to_csv``)."""
+        out_dir, stem, suffix = [
+            kwargs.get(key) for key in ('dirname', 'root', 'ext')]
+        for tbl in self._tables:
+            name = '{}-page-{}-table-{}{}'.format(
+                stem, tbl.page, tbl.order, suffix)
+            target = os.path.join(out_dir, name)
+            self._format_func(tbl, f)(target)
+
+    def _compress_dir(self, **kwargs):
+        """Zip the per-table files found in ``dirname`` into ``<root>.zip``,
+        created in the directory of ``path``; entries are stored flat."""
+        path, out_dir, stem, suffix = [
+            kwargs.get(key) for key in ('path', 'dirname', 'root', 'ext')]
+        archive_path = os.path.join(os.path.dirname(path), stem) + '.zip'
+        with zipfile.ZipFile(archive_path, 'w', allowZip64=True) as archive:
+            for tbl in self._tables:
+                name = '{}-page-{}-table-{}{}'.format(
+                    stem, tbl.page, tbl.order, suffix)
+                member = os.path.join(out_dir, name)
+                archive.write(member, os.path.basename(member))
+
+    def export(self, path, f='csv', compress=False):
+        """Write every table as ``f`` (csv/json/html: one file per table;
+        excel: one sheet per table); any other ``f`` writes nothing.
+        ``compress=True`` stages output in a temp dir, zips it to
+        ``<root>.zip`` beside ``path`` and then removes the temp dir."""
+        import shutil  # local import: only needed for the cleanup below
+        basename = os.path.basename(path)
+        root, ext = os.path.splitext(basename)
+        dirname = tempfile.mkdtemp() if compress else os.path.dirname(path)
+        kwargs = {'path': path, 'dirname': dirname, 'root': root, 'ext': ext}
+        try:
+            if f in ['csv', 'json', 'html']:
+                self._write_file(f=f, **kwargs)
+                if compress:
+                    self._compress_dir(**kwargs)
+            elif f == 'excel':
+                filepath = os.path.join(dirname, basename)
+                writer = pd.ExcelWriter(filepath)
+                for table in self._tables:
+                    sheet_name = 'page-{}-table-{}'.format(table.page, table.order)
+                    table.df.to_excel(writer, sheet_name=sheet_name, encoding='utf-8')
+                writer.save()
+                if compress:
+                    zipname = os.path.join(os.path.dirname(path), root) + '.zip'
+                    with zipfile.ZipFile(zipname, 'w', allowZip64=True) as z:
+                        z.write(filepath, os.path.basename(filepath))
+        finally:
+            if compress:
+                shutil.rmtree(dirname, ignore_errors=True)
+
 
 class Geometry(object):
     """
 
     """
     def __init__(self):
-        self._text = []
-        self._images = ()
-        self._segments = ()
-        self._tables = []
+        self.text = []
+        self.images = ()
+        self.segments = ()
+        self.tables = []
 
-    @property
-    def text(self):
-        """
-
-        Returns
-        -------
-
-        """
-        return self._text
-
-    @text.setter
-    def text(self, t):
-        self._text = t
-
-    @property
-    def images(self):
-        """
-
-        Returns
-        -------
-
-        """
-        return self._images
-
-    @images.setter
-    def images(self, i):
-        self._images = i
-
-    @property
-    def segments(self):
-        """
-
-        Returns
-        -------
-
-        """
-        return self._segments
-
-    @segments.setter
-    def segments(self, s):
-        self._segments = s
-
-    @property
-    def tables(self):
-        """
-
-        Returns
-        -------
-
-        """
-        return self._tables
-
-    @tables.setter
-    def tables(self, tb):
-        self._tables = tb
+    def __repr__(self):
+        """Show the class name with the length of each geometry field."""
+        template = '<{} text={} images={} segments={} tables={}>'
+        sizes = [
+            len(field)
+            for field in (self.text, self.images, self.segments, self.tables)]
+        return template.format(self.__class__.__name__, *sizes)
 
 
 class GeometryList(object):
@@ -458,55 +421,15 @@ class GeometryList(object):
 
     """
     def __init__(self, geometry):
-        self._text = [g.text for g in geometry]
-        self._images = [g.images for g in geometry]
-        self._segments = [g.segments for g in geometry]
-        self._tables = [g.tables for g in geometry]
+        self.text = [g.text for g in geometry]
+        self.images = [g.images for g in geometry]
+        self.segments = [g.segments for g in geometry]
+        self.tables = [g.tables for g in geometry]
 
     def __repr__(self):
         return '<{} text={} images={} segments={} tables={}>'.format(
             self.__class__.__name__,
-            len(self._text),
-            len(self._images),
-            len(self._segments),
-            len(self._tables))
-
-    @property
-    def text(self):
-        """
-
-        Returns
-        -------
-
-        """
-        return self._text
-
-    @property
-    def images(self):
-        """
-
-        Returns
-        -------
-
-        """
-        return self._images
-
-    @property
-    def segments(self):
-        """
-
-        Returns
-        -------
-
-        """
-        return self._segments
-
-    @property
-    def tables(self):
-        """
-
-        Returns
-        -------
-
-        """
-        return self._tables
\ No newline at end of file
+            len(self.text),
+            len(self.images),
+            len(self.segments),
+            len(self.tables))
\ No newline at end of file
diff --git a/camelot/handlers.py b/camelot/handlers.py
index c4bcfd8..af4db00 100644
--- a/camelot/handlers.py
+++ b/camelot/handlers.py
@@ -17,7 +17,7 @@ class PDFHandler(object):
         if not self.filename.endswith('.pdf'):
             raise TypeError("File format not supported.")
         self.pages = self.__get_pages(self.filename, pages)
-        self.temp = tempfile.mkdtemp()
+        self.tempdir = tempfile.mkdtemp()
 
     def __get_pages(self, filename, pages):
         # refactor
@@ -47,7 +47,7 @@ class PDFHandler(object):
         with open(filename, 'rb') as fileobj:
             infile = PdfFileReader(fileobj, strict=False)
             fpath = os.path.join(temp, 'page-{0}.pdf'.format(page))
-            fname, fext = os.path.splitext(fpath)
+            froot, fext = os.path.splitext(fpath)
             p = infile.getPage(page - 1)
             outfile = PdfFileWriter()
             outfile.addPage(p)
@@ -60,7 +60,7 @@ class PDFHandler(object):
             ltchar = get_text_objects(layout, ltype="char")
             rotation = get_rotation(lttextlh, lttextlv, ltchar)
             if rotation != '':
-                fpath_new = ''.join([fname.replace('page', 'p'), '_rotated', fext])
+                fpath_new = ''.join([froot.replace('page', 'p'), '_rotated', fext])
                 os.rename(fpath, fpath_new)
                 infile = PdfFileReader(open(fpath_new, 'rb'), strict=False)
                 outfile = PdfFileWriter()
@@ -86,8 +86,8 @@ class PDFHandler(object):
 
         """
         for p in self.pages:
-            self.__save_page(self.filename, p, self.temp)
-        pages = [os.path.join(self.temp, 'page-{0}.pdf'.format(p))
+            self.__save_page(self.filename, p, self.tempdir)
+        pages = [os.path.join(self.tempdir, 'page-{0}.pdf'.format(p))
                  for p in self.pages]
         tables = []
         geometry = []
diff --git a/camelot/parsers/base.py b/camelot/parsers/base.py
index 0c1b54b..79cb986 100644
--- a/camelot/parsers/base.py
+++ b/camelot/parsers/base.py
@@ -18,5 +18,5 @@ class BaseParser(object):
         self.horizontal_text = get_text_objects(self.layout, ltype="lh")
         self.vertical_text = get_text_objects(self.layout, ltype="lv")
         self.pdf_width, self.pdf_height = self.dimensions
-        self.basename, __ = os.path.splitext(self.filename)
+        self.rootname, __ = os.path.splitext(self.filename)
         self.g = Geometry()
\ No newline at end of file
diff --git a/camelot/parsers/lattice.py b/camelot/parsers/lattice.py
index 282a96a..a758af7 100644
--- a/camelot/parsers/lattice.py
+++ b/camelot/parsers/lattice.py
@@ -85,7 +85,7 @@ class Lattice(BaseParser):
         return t
 
     def _generate_image(self):
-        self.imagename = ''.join([self.basename, '.png'])
+        self.imagename = ''.join([self.rootname, '.png'])
         gs_call = [
             "-q", "-sDEVICE=png16m", "-o", self.imagename, "-r600", self.filename
         ]
@@ -164,7 +164,7 @@ class Lattice(BaseParser):
         v_s = kwargs.get('v_s')
         h_s = kwargs.get('h_s')
         if v_s is None or h_s is None:
-            raise ValueError('No segments found on {}'.format(self.basename))
+            raise ValueError('No segments found on {}'.format(self.rootname))
 
         table = Table(cols, rows)
         # set table edges to True using ver+hor lines
@@ -199,7 +199,7 @@ class Lattice(BaseParser):
         table.accuracy = accuracy
         table.whitespace = whitespace
         table.order = table_idx + 1
-        table.page = int(os.path.basename(self.basename).replace('page-', ''))
+        table.page = int(os.path.basename(self.rootname).replace('page-', ''))
 
         return table
 
@@ -219,7 +219,7 @@ class Lattice(BaseParser):
 
         if not self.horizontal_text:
             warnings.warn("No tables found on {}".format(
-                os.path.basename(self.basename)))
+                os.path.basename(self.rootname)))
             return [], self.g
 
         self._generate_image()
diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py
index 1849a0c..fe3a3e8 100644
--- a/camelot/parsers/stream.py
+++ b/camelot/parsers/stream.py
@@ -32,12 +32,6 @@ class Stream(BaseParser):
         self.flag_size = flag_size
         self.debug = debug
 
-    def _validate_columns(self):
-        if self.table_area is not None and self.columns is not None:
-            if len(self.table_area) != len(self.columns):
-                raise ValueError("Length of table_area and columns"
-                                 " should be equal")
-
     @staticmethod
     def _text_bbox(t_bbox):
         xmin = min([t.x0 for direction in t_bbox for t in t_bbox[direction]])
@@ -125,6 +119,12 @@ class Stream(BaseParser):
                 for i in range(0, len(cols) - 1)]
         return cols
 
+    def _validate_columns(self):
+        if self.table_area is None or self.columns is None:
+            return  # nothing to cross-check unless both were supplied
+        if len(self.table_area) != len(self.columns):
+            raise ValueError("Length of table_area and columns should be equal")
+
     def _generate_table_bbox(self):
         if self.table_area is not None:
             table_bbox = {}
@@ -169,7 +169,7 @@ class Stream(BaseParser):
             if ncols == 1:
                 # no tables condition
                 warnings.warn("No tables found on {}".format(
-                    os.path.basename(self.basename)))
+                    os.path.basename(self.rootname)))
             cols = [(t.x0, t.x1)
                 for r in rows_grouped if len(r) == ncols for t in r]
             cols = self._merge_columns(sorted(cols), mtol=self.mtol)
@@ -213,7 +213,7 @@ class Stream(BaseParser):
         table.accuracy = accuracy
         table.whitespace = whitespace
         table.order = table_idx + 1
-        table.page = int(os.path.basename(self.basename).replace('page-', ''))
+        table.page = int(os.path.basename(self.rootname).replace('page-', ''))
 
         return table
 
@@ -233,7 +233,7 @@ class Stream(BaseParser):
 
         if not self.horizontal_text:
             warnings.warn("No tables found on {}".format(
-                os.path.basename(self.basename)))
+                os.path.basename(self.rootname)))
             return [], self.g
 
         self._generate_table_bbox()