From 0c329634e738be01c937d724f47826fbe8d5843a Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Fri, 7 Sep 2018 05:13:34 +0530 Subject: [PATCH] Add export to TableList and Table --- README.md | 4 +- camelot/core.py | 373 +++++++++++++++---------------------- camelot/handlers.py | 10 +- camelot/parsers/base.py | 2 +- camelot/parsers/lattice.py | 8 +- camelot/parsers/stream.py | 18 +- 6 files changed, 169 insertions(+), 246 deletions(-) diff --git a/README.md b/README.md index 4a5227f..b5a3e8b 100644 --- a/README.md +++ b/README.md @@ -9,9 +9,10 @@ Camelot is a Python 2.7 library and command-line tool for getting tables out of >>> tables = camelot.read_pdf("foo.pdf") >>> tables <TableList n=2> ->>> tables.to_csv(zip=True) # to_json, to_excel, to_html +>>> tables.export("foo.csv", f="csv", compress=True) # json, excel, html >>> tables[0] <Table shape=(3,4)> +>>> tables[0].to_csv("foo.csv") # to_json, to_excel, to_html >>> tables[0].parsing_report { "accuracy": 96, @@ -20,7 +21,6 @@ Camelot is a Python 2.7 library and command-line tool for getting tables out of "page": 1 } >>> df = tables[0].df ->>> tables[0].to_csv("foo.csv") # to_json, to_excel, to_html Camelot comes with a CLI where you can specify page numbers, output format, output directory etc. By default, the output files are placed in the same directory as the PDF. diff --git a/camelot/core.py b/camelot/core.py index 9f98f16..2003979 100644 --- a/camelot/core.py +++ b/camelot/core.py @@ -1,6 +1,10 @@ +import os import json +import zipfile +import tempfile import numpy as np +import pandas as pd class Cell(object): @@ -68,16 +72,46 @@ class Table(object): self.rows = rows self.cells = [[Cell(c[0], r[1], c[1], r[0]) for c in cols] for r in rows] - self._df = None - self._shape = (0, 0) - self._accuracy = 0 - self._whitespace = 0 - self._order = None - self._page = None + self.df = None + self.shape = (0, 0) + self.accuracy = 0 + self.whitespace = 0 + self.order = None + self.page = None def __repr__(self): return '<{} shape={}>'.format(self.__class__.__name__, self._shape) + @property + def data(self): + """ + + Returns + ------- + + """ + d = [] + for row in self.cells: + d.append([cell.text.strip() for cell in row]) + return d + + @property + def parsing_report(self): + """ + + Returns + ------- + + """ + # pretty? + report = { + 'accuracy': self.accuracy, + 'whitespace': self.whitespace, + 'order': self.order, + 'page': self.page + } + return report + def set_border(self): """ @@ -253,119 +287,38 @@ class Table(object): cell.hspan = True return self - @property - def data(self): - """ - - Returns - ------- - - """ - d = [] - for row in self.cells: - d.append([cell.text.strip() for cell in row]) - return d - - @property - def df(self): - """ - - Returns - ------- - - """ - return self._df - - @df.setter - def df(self, dataframe): - self._df = dataframe - - @property - def shape(self): - """ - - Returns - ------- - - """ - return self._shape - - @shape.setter - def shape(self, s): - self._shape = s - - @property - def accuracy(self): - """ - - Returns - ------- - - """ - return self._accuracy - - @accuracy.setter - def accuracy(self, a): - self._accuracy = a - - @property - def whitespace(self): - """ - - Returns - ------- - - """ - return self._whitespace - - @whitespace.setter - def whitespace(self, w): - self._whitespace = w - - @property - def order(self): - """ - - Returns - ------- - - """ - return self._order - - @order.setter - def order(self, o): - self._order = o - - @property - def page(self): - """ - - Returns - ------- - - """ - return self._page - - @page.setter - def page(self, p): - self._page = p - - @property - def parsing_report(self): - """ - - Returns - ------- - - """ - # pretty? - report = { - 'accuracy': self._accuracy, - 'whitespace': self._whitespace, - 'order': self._order, - 'page': self._page + def to_csv(self, path, **kwargs): + kw = { + 'encoding': 'utf-8', + 'index': False, + 'quoting': 1 } - return report + kw.update(kwargs) + self.df.to_csv(path, **kw) + + def to_json(self, path, **kwargs): + kw = { + 'orient': 'records' + } + kw.update(kwargs) + json_string = self.df.to_json(**kw) + with open(path, 'w') as f: + f.write(json_string) + + def to_excel(self, path, **kwargs): + kw = { + 'sheet_name': 'page-{}-table-{}'.format(self.page, self.order), + 'encoding': 'utf-8' + } + kw.update(kwargs) + writer = pd.ExcelWriter(path) + self.df.to_excel(writer, **kw) + writer.save() + + def to_html(self, path, **kwargs): + html_string = self.df.to_html(**kwargs) + with open(path, 'w') as f: + f.write(html_string) class TableList(object): @@ -385,72 +338,82 @@ class TableList(object): def __getitem__(self, idx): return self._tables[idx] + @staticmethod + def _format_func(table, f): + return getattr(table, 'to_{}'.format(f)) + + def _write_file(self, f=None, **kwargs): + dirname = kwargs.get('dirname') + root = kwargs.get('root') + ext = kwargs.get('ext') + for table in self._tables: + filename = os.path.join('{}-page-{}-table-{}{}'.format( + root, table.page, table.order, ext)) + filepath = os.path.join(dirname, filename) + to_format = self._format_func(table, f) + to_format(filepath) + + def _compress_dir(self, **kwargs): + path = kwargs.get('path') + dirname = kwargs.get('dirname') + root = kwargs.get('root') + ext = kwargs.get('ext') + zipname = os.path.join(os.path.dirname(path), root) + '.zip' + with zipfile.ZipFile(zipname, 'w', allowZip64=True) as z: + for table in self._tables: + filename = os.path.join('{}-page-{}-table-{}{}'.format( + root, table.page, table.order, ext)) + filepath = os.path.join(dirname, filename) + z.write(filepath, os.path.basename(filepath)) + + def export(self, path, f='csv', compress=False): + dirname = os.path.dirname(path) + basename = os.path.basename(path) + root, ext = os.path.splitext(basename) + if compress: + dirname = tempfile.mkdtemp() + + kwargs = { + 'path': path, + 'dirname': dirname, + 'root': root, + 'ext': ext + } + + if f in ['csv', 'json', 'html']: + self._write_file(f=f, **kwargs) + if compress: + self._compress_dir(**kwargs) + elif f == 'excel': + filepath = os.path.join(dirname, basename) + writer = pd.ExcelWriter(filepath) + for table in self._tables: + sheet_name = 'page-{}-table-{}'.format(table.page, table.order) + table.df.to_excel(writer, sheet_name=sheet_name, encoding='utf-8') + writer.save() + if compress: + zipname = os.path.join(os.path.dirname(path), root) + '.zip' + with zipfile.ZipFile(zipname, 'w', allowZip64=True) as z: + z.write(filepath, os.path.basename(filepath)) + class Geometry(object): """ """ def __init__(self): - self._text = [] - self._images = () - self._segments = () - self._tables = [] + self.text = [] + self.images = () + self.segments = () + self.tables = [] - @property - def text(self): - """ - - Returns - ------- - - """ - return self._text - - @text.setter - def text(self, t): - self._text = t - - @property - def images(self): - """ - - Returns - ------- - - """ - return self._images - - @images.setter - def images(self, i): - self._images = i - - @property - def segments(self): - """ - - Returns - ------- - - """ - return self._segments - - @segments.setter - def segments(self, s): - self._segments = s - - @property - def tables(self): - """ - - Returns - ------- - - """ - return self._tables - - @tables.setter - def tables(self, tb): - self._tables = tb + def __repr__(self): + return '<{} text={} images={} segments={} tables={}>'.format( + self.__class__.__name__, + len(self.text), + len(self.images), + len(self.segments), + len(self.tables)) class GeometryList(object): @@ -458,55 +421,15 @@ class GeometryList(object): """ def __init__(self, geometry): - self._text = [g.text for g in geometry] - self._images = [g.images for g in geometry] - self._segments = [g.segments for g in geometry] - self._tables = [g.tables for g in geometry] + self.text = [g.text for g in geometry] + self.images = [g.images for g in geometry] + self.segments = [g.segments for g in geometry] + self.tables = [g.tables for g in geometry] def __repr__(self): return '<{} text={} images={} segments={} tables={}>'.format( self.__class__.__name__, - len(self._text), - len(self._images), - len(self._segments), - len(self._tables)) - - @property - def text(self): - """ - - Returns - ------- - - """ - return self._text - - @property - def images(self): - """ - - Returns - ------- - - """ - return self._images - - @property - def segments(self): - """ - - Returns - ------- - - """ - return self._segments - - @property - def tables(self): - """ - - Returns - ------- - - """ - return self._tables \ No newline at end of file + len(self.text), + len(self.images), + len(self.segments), + len(self.tables)) \ No newline at end of file diff --git a/camelot/handlers.py b/camelot/handlers.py index c4bcfd8..af4db00 100644 --- a/camelot/handlers.py +++ b/camelot/handlers.py @@ -17,7 +17,7 @@ class PDFHandler(object): if not self.filename.endswith('.pdf'): raise TypeError("File format not supported.") self.pages = self.__get_pages(self.filename, pages) - self.temp = tempfile.mkdtemp() + self.tempdir = tempfile.mkdtemp() def __get_pages(self, filename, pages): # refactor @@ -47,7 +47,7 @@ class PDFHandler(object): with open(filename, 'rb') as fileobj: infile = PdfFileReader(fileobj, strict=False) fpath = os.path.join(temp, 'page-{0}.pdf'.format(page)) - fname, fext = os.path.splitext(fpath) + froot, fext = os.path.splitext(fpath) p = infile.getPage(page - 1) outfile = PdfFileWriter() outfile.addPage(p) @@ -60,7 +60,7 @@ class PDFHandler(object): ltchar = get_text_objects(layout, ltype="char") rotation = get_rotation(lttextlh, lttextlv, ltchar) if rotation != '': - fpath_new = ''.join([fname.replace('page', 'p'), '_rotated', fext]) + fpath_new = ''.join([froot.replace('page', 'p'), '_rotated', fext]) os.rename(fpath, fpath_new) infile = PdfFileReader(open(fpath_new, 'rb'), strict=False) outfile = PdfFileWriter() @@ -86,8 +86,8 @@ class PDFHandler(object): """ for p in self.pages: - self.__save_page(self.filename, p, self.temp) - pages = [os.path.join(self.temp, 'page-{0}.pdf'.format(p)) + self.__save_page(self.filename, p, self.tempdir) + pages = [os.path.join(self.tempdir, 'page-{0}.pdf'.format(p)) for p in self.pages] tables = [] geometry = [] diff --git a/camelot/parsers/base.py b/camelot/parsers/base.py index 0c1b54b..79cb986 100644 --- a/camelot/parsers/base.py +++ b/camelot/parsers/base.py @@ -18,5 +18,5 @@ class BaseParser(object): self.horizontal_text = get_text_objects(self.layout, ltype="lh") self.vertical_text = get_text_objects(self.layout, ltype="lv") self.pdf_width, self.pdf_height = self.dimensions - self.basename, __ = os.path.splitext(self.filename) + self.rootname, __ = os.path.splitext(self.filename) self.g = Geometry() \ No newline at end of file diff --git a/camelot/parsers/lattice.py b/camelot/parsers/lattice.py index 282a96a..a758af7 100644 --- a/camelot/parsers/lattice.py +++ b/camelot/parsers/lattice.py @@ -85,7 +85,7 @@ class Lattice(BaseParser): return t def _generate_image(self): - self.imagename = ''.join([self.basename, '.png']) + self.imagename = ''.join([self.rootname, '.png']) gs_call = [ "-q", "-sDEVICE=png16m", "-o", self.imagename, "-r600", self.filename ] @@ -164,7 +164,7 @@ class Lattice(BaseParser): v_s = kwargs.get('v_s') h_s = kwargs.get('h_s') if v_s is None or h_s is None: - raise ValueError('No segments found on {}'.format(self.basename)) + raise ValueError('No segments found on {}'.format(self.rootname)) table = Table(cols, rows) # set table edges to True using ver+hor lines @@ -199,7 +199,7 @@ class Lattice(BaseParser): table.accuracy = accuracy table.whitespace = whitespace table.order = table_idx + 1 - table.page = int(os.path.basename(self.basename).replace('page-', '')) + table.page = int(os.path.basename(self.rootname).replace('page-', '')) return table @@ -219,7 +219,7 @@ class Lattice(BaseParser): if not self.horizontal_text: warnings.warn("No tables found on {}".format( - os.path.basename(self.basename))) + os.path.basename(self.rootname))) return [], self.g self._generate_image() diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py index 1849a0c..fe3a3e8 100644 --- a/camelot/parsers/stream.py +++ b/camelot/parsers/stream.py @@ -32,12 +32,6 @@ class Stream(BaseParser): self.flag_size = flag_size self.debug = debug - def _validate_columns(self): - if self.table_area is not None and self.columns is not None: - if len(self.table_area) != len(self.columns): - raise ValueError("Length of table_area and columns" - " should be equal") - @staticmethod def _text_bbox(t_bbox): xmin = min([t.x0 for direction in t_bbox for t in t_bbox[direction]]) @@ -125,6 +119,12 @@ class Stream(BaseParser): for i in range(0, len(cols) - 1)] return cols + def _validate_columns(self): + if self.table_area is not None and self.columns is not None: + if len(self.table_area) != len(self.columns): + raise ValueError("Length of table_area and columns" + " should be equal") + def _generate_table_bbox(self): if self.table_area is not None: table_bbox = {} @@ -169,7 +169,7 @@ class Stream(BaseParser): if ncols == 1: # no tables condition warnings.warn("No tables found on {}".format( - os.path.basename(self.basename))) + os.path.basename(self.rootname))) cols = [(t.x0, t.x1) for r in rows_grouped if len(r) == ncols for t in r] cols = self._merge_columns(sorted(cols), mtol=self.mtol) @@ -213,7 +213,7 @@ class Stream(BaseParser): table.accuracy = accuracy table.whitespace = whitespace table.order = table_idx + 1 - table.page = int(os.path.basename(self.basename).replace('page-', '')) + table.page = int(os.path.basename(self.rootname).replace('page-', '')) return table @@ -233,7 +233,7 @@ class Stream(BaseParser): if not self.horizontal_text: warnings.warn("No tables found on {}".format( - os.path.basename(self.basename))) + os.path.basename(self.rootname))) return [], self.g self._generate_table_bbox()