Add export to TableList and Table

pull/2/head
Vinayak Mehta 2018-09-07 05:13:34 +05:30
parent 557189da24
commit 0c329634e7
6 changed files with 169 additions and 246 deletions

View File

@ -9,9 +9,10 @@ Camelot is a Python 2.7 library and command-line tool for getting tables out of
>>> tables = camelot.read_pdf("foo.pdf") >>> tables = camelot.read_pdf("foo.pdf")
>>> tables >>> tables
<TableList n=2> <TableList n=2>
>>> tables.to_csv(zip=True) # to_json, to_excel, to_html >>> tables.export("foo.csv", f="csv", compress=True) # json, excel, html
>>> tables[0] >>> tables[0]
<Table shape=(3,4)> <Table shape=(3,4)>
>>> tables[0].to_csv("foo.csv") # to_json, to_excel, to_html
>>> tables[0].parsing_report >>> tables[0].parsing_report
{ {
"accuracy": 96, "accuracy": 96,
@ -20,7 +21,6 @@ Camelot is a Python 2.7 library and command-line tool for getting tables out of
"page": 1 "page": 1
} }
>>> df = tables[0].df >>> df = tables[0].df
>>> tables[0].to_csv("foo.csv") # to_json, to_excel, to_html
</pre> </pre>
Camelot comes with a CLI where you can specify page numbers, output format, output directory etc. By default, the output files are placed in the same directory as the PDF. Camelot comes with a CLI where you can specify page numbers, output format, output directory etc. By default, the output files are placed in the same directory as the PDF.

View File

@ -1,6 +1,10 @@
import os
import json import json
import zipfile
import tempfile
import numpy as np import numpy as np
import pandas as pd
class Cell(object): class Cell(object):
@ -68,16 +72,46 @@ class Table(object):
self.rows = rows self.rows = rows
self.cells = [[Cell(c[0], r[1], c[1], r[0]) self.cells = [[Cell(c[0], r[1], c[1], r[0])
for c in cols] for r in rows] for c in cols] for r in rows]
self._df = None self.df = None
self._shape = (0, 0) self.shape = (0, 0)
self._accuracy = 0 self.accuracy = 0
self._whitespace = 0 self.whitespace = 0
self._order = None self.order = None
self._page = None self.page = None
def __repr__(self): def __repr__(self):
return '<{} shape={}>'.format(self.__class__.__name__, self._shape) return '<{} shape={}>'.format(self.__class__.__name__, self._shape)
@property
def data(self):
    """Text of every cell, arranged row by row.

    Returns
    -------
    list
        One list per row of ``self.cells``; each entry is that cell's
        ``text`` with leading and trailing whitespace stripped.

    """
    return [[cell.text.strip() for cell in row] for row in self.cells]
@property
def parsing_report(self):
    """Summary of the parsing metrics stored on this table.

    Returns
    -------
    dict
        Mapping with the keys 'accuracy', 'whitespace', 'order' and
        'page', each holding the attribute of the same name.

    """
    return {
        'accuracy': self.accuracy,
        'whitespace': self.whitespace,
        'order': self.order,
        'page': self.page,
    }
def set_border(self): def set_border(self):
""" """
@ -253,119 +287,38 @@ class Table(object):
cell.hspan = True cell.hspan = True
return self return self
@property def to_csv(self, path, **kwargs):
def data(self): kw = {
""" 'encoding': 'utf-8',
'index': False,
Returns 'quoting': 1
-------
"""
d = []
for row in self.cells:
d.append([cell.text.strip() for cell in row])
return d
@property
def df(self):
"""
Returns
-------
"""
return self._df
@df.setter
def df(self, dataframe):
self._df = dataframe
@property
def shape(self):
"""
Returns
-------
"""
return self._shape
@shape.setter
def shape(self, s):
self._shape = s
@property
def accuracy(self):
"""
Returns
-------
"""
return self._accuracy
@accuracy.setter
def accuracy(self, a):
self._accuracy = a
@property
def whitespace(self):
"""
Returns
-------
"""
return self._whitespace
@whitespace.setter
def whitespace(self, w):
self._whitespace = w
@property
def order(self):
"""
Returns
-------
"""
return self._order
@order.setter
def order(self, o):
self._order = o
@property
def page(self):
"""
Returns
-------
"""
return self._page
@page.setter
def page(self, p):
self._page = p
@property
def parsing_report(self):
"""
Returns
-------
"""
# pretty?
report = {
'accuracy': self._accuracy,
'whitespace': self._whitespace,
'order': self._order,
'page': self._page
} }
return report kw.update(kwargs)
self.df.to_csv(path, **kw)
def to_json(self, path, **kwargs):
    """Serialize the table's DataFrame to JSON and write it to ``path``.

    Parameters
    ----------
    path : str
        Destination file; it is opened in text write mode.
    **kwargs
        Forwarded to ``self.df.to_json``. ``orient`` defaults to
        'records' unless the caller overrides it.

    """
    # Caller-supplied options win over the default orient.
    options = dict({'orient': 'records'}, **kwargs)
    # Serialize before opening the file so a failure leaves no empty file.
    payload = self.df.to_json(**options)
    with open(path, 'w') as out:
        out.write(payload)
def to_excel(self, path, **kwargs):
    """Write the table's DataFrame to an Excel workbook at ``path``.

    Parameters
    ----------
    path : str
        Destination workbook file.
    **kwargs
        Forwarded to ``self.df.to_excel``. By default the sheet is named
        ``page-<page>-table-<order>`` and ``encoding`` is 'utf-8';
        caller-supplied values override both.

    """
    kw = {
        'sheet_name': 'page-{}-table-{}'.format(self.page, self.order),
        # NOTE(review): ``encoding`` was dropped from DataFrame.to_excel in
        # pandas 2.0 -- confirm the pandas version this project pins.
        'encoding': 'utf-8'
    }
    kw.update(kwargs)
    # Using the writer as a context manager saves the workbook on exit and
    # still releases the file handle when ``to_excel`` raises, instead of
    # relying on a trailing ``writer.save()`` that is skipped on error.
    with pd.ExcelWriter(path) as writer:
        self.df.to_excel(writer, **kw)
def to_html(self, path, **kwargs):
    """Render the table's DataFrame as HTML and write it to ``path``.

    Parameters
    ----------
    path : str
        Destination file; it is opened in text write mode.
    **kwargs
        Forwarded unchanged to ``self.df.to_html``.

    """
    # Render before opening the file so a failure leaves no empty file.
    markup = self.df.to_html(**kwargs)
    with open(path, 'w') as out:
        out.write(markup)
class TableList(object): class TableList(object):
@ -385,72 +338,82 @@ class TableList(object):
def __getitem__(self, idx): def __getitem__(self, idx):
return self._tables[idx] return self._tables[idx]
@staticmethod
def _format_func(table, f):
return getattr(table, 'to_{}'.format(f))
def _write_file(self, f=None, **kwargs):
dirname = kwargs.get('dirname')
root = kwargs.get('root')
ext = kwargs.get('ext')
for table in self._tables:
filename = os.path.join('{}-page-{}-table-{}{}'.format(
root, table.page, table.order, ext))
filepath = os.path.join(dirname, filename)
to_format = self._format_func(table, f)
to_format(filepath)
def _compress_dir(self, **kwargs):
path = kwargs.get('path')
dirname = kwargs.get('dirname')
root = kwargs.get('root')
ext = kwargs.get('ext')
zipname = os.path.join(os.path.dirname(path), root) + '.zip'
with zipfile.ZipFile(zipname, 'w', allowZip64=True) as z:
for table in self._tables:
filename = os.path.join('{}-page-{}-table-{}{}'.format(
root, table.page, table.order, ext))
filepath = os.path.join(dirname, filename)
z.write(filepath, os.path.basename(filepath))
def export(self, path, f='csv', compress=False):
    """Write every table in the list to disk in the requested format.

    For 'csv', 'json' and 'html', one file per table is written through
    ``_write_file``. For 'excel', a single workbook named after ``path``
    is written with one sheet per table.

    Parameters
    ----------
    path : str
        Output file path. Its directory, stem and extension are used to
        derive the names of the generated files.
    f : str, optional (default: 'csv')
        Output format: 'csv', 'json', 'html' or 'excel'.
    compress : bool, optional (default: False)
        If True, files are written to a temporary directory and bundled
        into ``<stem>.zip`` in the directory of ``path``.

    Raises
    ------
    ValueError
        If ``f`` is not one of the supported formats.

    """
    # Function-scope import: only needed to remove the scratch directory.
    import shutil

    # Reject unknown formats up front instead of silently writing nothing.
    if f not in ('csv', 'json', 'html', 'excel'):
        raise ValueError(
            "Unsupported export format {!r}; expected one of"
            " 'csv', 'json', 'html', 'excel'".format(f))

    dirname = os.path.dirname(path)
    basename = os.path.basename(path)
    root, ext = os.path.splitext(basename)
    if compress:
        # Stage the files in a scratch directory; only the zip is kept.
        dirname = tempfile.mkdtemp()

    kwargs = {
        'path': path,
        'dirname': dirname,
        'root': root,
        'ext': ext
    }

    try:
        if f == 'excel':
            filepath = os.path.join(dirname, basename)
            # The context manager saves the workbook and closes the file
            # handle even if writing one of the sheets fails.
            with pd.ExcelWriter(filepath) as writer:
                for table in self._tables:
                    sheet_name = 'page-{}-table-{}'.format(
                        table.page, table.order)
                    # NOTE(review): ``encoding`` was dropped from
                    # DataFrame.to_excel in pandas 2.0 -- confirm the
                    # pandas version this project pins.
                    table.df.to_excel(writer, sheet_name=sheet_name,
                                      encoding='utf-8')
            if compress:
                zipname = os.path.join(os.path.dirname(path), root) + '.zip'
                with zipfile.ZipFile(zipname, 'w', allowZip64=True) as z:
                    z.write(filepath, os.path.basename(filepath))
        else:
            self._write_file(f=f, **kwargs)
            if compress:
                self._compress_dir(**kwargs)
    finally:
        if compress:
            # The scratch directory was previously left behind on every
            # compressed export; remove it once the archive is written
            # (or the export failed).
            shutil.rmtree(dirname, ignore_errors=True)
class Geometry(object): class Geometry(object):
""" """
""" """
def __init__(self): def __init__(self):
self._text = [] self.text = []
self._images = () self.images = ()
self._segments = () self.segments = ()
self._tables = [] self.tables = []
@property def __repr__(self):
def text(self): return '<{} text={} images={} segments={} tables={}>'.format(
""" self.__class__.__name__,
len(self.text),
Returns len(self.images),
------- len(self.segments),
len(self.tables))
"""
return self._text
@text.setter
def text(self, t):
self._text = t
@property
def images(self):
"""
Returns
-------
"""
return self._images
@images.setter
def images(self, i):
self._images = i
@property
def segments(self):
"""
Returns
-------
"""
return self._segments
@segments.setter
def segments(self, s):
self._segments = s
@property
def tables(self):
"""
Returns
-------
"""
return self._tables
@tables.setter
def tables(self, tb):
self._tables = tb
class GeometryList(object): class GeometryList(object):
@ -458,55 +421,15 @@ class GeometryList(object):
""" """
def __init__(self, geometry): def __init__(self, geometry):
self._text = [g.text for g in geometry] self.text = [g.text for g in geometry]
self._images = [g.images for g in geometry] self.images = [g.images for g in geometry]
self._segments = [g.segments for g in geometry] self.segments = [g.segments for g in geometry]
self._tables = [g.tables for g in geometry] self.tables = [g.tables for g in geometry]
def __repr__(self): def __repr__(self):
return '<{} text={} images={} segments={} tables={}>'.format( return '<{} text={} images={} segments={} tables={}>'.format(
self.__class__.__name__, self.__class__.__name__,
len(self._text), len(self.text),
len(self._images), len(self.images),
len(self._segments), len(self.segments),
len(self._tables)) len(self.tables))
@property
def text(self):
"""
Returns
-------
"""
return self._text
@property
def images(self):
"""
Returns
-------
"""
return self._images
@property
def segments(self):
"""
Returns
-------
"""
return self._segments
@property
def tables(self):
"""
Returns
-------
"""
return self._tables

View File

@ -17,7 +17,7 @@ class PDFHandler(object):
if not self.filename.endswith('.pdf'): if not self.filename.endswith('.pdf'):
raise TypeError("File format not supported.") raise TypeError("File format not supported.")
self.pages = self.__get_pages(self.filename, pages) self.pages = self.__get_pages(self.filename, pages)
self.temp = tempfile.mkdtemp() self.tempdir = tempfile.mkdtemp()
def __get_pages(self, filename, pages): def __get_pages(self, filename, pages):
# refactor # refactor
@ -47,7 +47,7 @@ class PDFHandler(object):
with open(filename, 'rb') as fileobj: with open(filename, 'rb') as fileobj:
infile = PdfFileReader(fileobj, strict=False) infile = PdfFileReader(fileobj, strict=False)
fpath = os.path.join(temp, 'page-{0}.pdf'.format(page)) fpath = os.path.join(temp, 'page-{0}.pdf'.format(page))
fname, fext = os.path.splitext(fpath) froot, fext = os.path.splitext(fpath)
p = infile.getPage(page - 1) p = infile.getPage(page - 1)
outfile = PdfFileWriter() outfile = PdfFileWriter()
outfile.addPage(p) outfile.addPage(p)
@ -60,7 +60,7 @@ class PDFHandler(object):
ltchar = get_text_objects(layout, ltype="char") ltchar = get_text_objects(layout, ltype="char")
rotation = get_rotation(lttextlh, lttextlv, ltchar) rotation = get_rotation(lttextlh, lttextlv, ltchar)
if rotation != '': if rotation != '':
fpath_new = ''.join([fname.replace('page', 'p'), '_rotated', fext]) fpath_new = ''.join([froot.replace('page', 'p'), '_rotated', fext])
os.rename(fpath, fpath_new) os.rename(fpath, fpath_new)
infile = PdfFileReader(open(fpath_new, 'rb'), strict=False) infile = PdfFileReader(open(fpath_new, 'rb'), strict=False)
outfile = PdfFileWriter() outfile = PdfFileWriter()
@ -86,8 +86,8 @@ class PDFHandler(object):
""" """
for p in self.pages: for p in self.pages:
self.__save_page(self.filename, p, self.temp) self.__save_page(self.filename, p, self.tempdir)
pages = [os.path.join(self.temp, 'page-{0}.pdf'.format(p)) pages = [os.path.join(self.tempdir, 'page-{0}.pdf'.format(p))
for p in self.pages] for p in self.pages]
tables = [] tables = []
geometry = [] geometry = []

View File

@ -18,5 +18,5 @@ class BaseParser(object):
self.horizontal_text = get_text_objects(self.layout, ltype="lh") self.horizontal_text = get_text_objects(self.layout, ltype="lh")
self.vertical_text = get_text_objects(self.layout, ltype="lv") self.vertical_text = get_text_objects(self.layout, ltype="lv")
self.pdf_width, self.pdf_height = self.dimensions self.pdf_width, self.pdf_height = self.dimensions
self.basename, __ = os.path.splitext(self.filename) self.rootname, __ = os.path.splitext(self.filename)
self.g = Geometry() self.g = Geometry()

View File

@ -85,7 +85,7 @@ class Lattice(BaseParser):
return t return t
def _generate_image(self): def _generate_image(self):
self.imagename = ''.join([self.basename, '.png']) self.imagename = ''.join([self.rootname, '.png'])
gs_call = [ gs_call = [
"-q", "-sDEVICE=png16m", "-o", self.imagename, "-r600", self.filename "-q", "-sDEVICE=png16m", "-o", self.imagename, "-r600", self.filename
] ]
@ -164,7 +164,7 @@ class Lattice(BaseParser):
v_s = kwargs.get('v_s') v_s = kwargs.get('v_s')
h_s = kwargs.get('h_s') h_s = kwargs.get('h_s')
if v_s is None or h_s is None: if v_s is None or h_s is None:
raise ValueError('No segments found on {}'.format(self.basename)) raise ValueError('No segments found on {}'.format(self.rootname))
table = Table(cols, rows) table = Table(cols, rows)
# set table edges to True using ver+hor lines # set table edges to True using ver+hor lines
@ -199,7 +199,7 @@ class Lattice(BaseParser):
table.accuracy = accuracy table.accuracy = accuracy
table.whitespace = whitespace table.whitespace = whitespace
table.order = table_idx + 1 table.order = table_idx + 1
table.page = int(os.path.basename(self.basename).replace('page-', '')) table.page = int(os.path.basename(self.rootname).replace('page-', ''))
return table return table
@ -219,7 +219,7 @@ class Lattice(BaseParser):
if not self.horizontal_text: if not self.horizontal_text:
warnings.warn("No tables found on {}".format( warnings.warn("No tables found on {}".format(
os.path.basename(self.basename))) os.path.basename(self.rootname)))
return [], self.g return [], self.g
self._generate_image() self._generate_image()

View File

@ -32,12 +32,6 @@ class Stream(BaseParser):
self.flag_size = flag_size self.flag_size = flag_size
self.debug = debug self.debug = debug
def _validate_columns(self):
if self.table_area is not None and self.columns is not None:
if len(self.table_area) != len(self.columns):
raise ValueError("Length of table_area and columns"
" should be equal")
@staticmethod @staticmethod
def _text_bbox(t_bbox): def _text_bbox(t_bbox):
xmin = min([t.x0 for direction in t_bbox for t in t_bbox[direction]]) xmin = min([t.x0 for direction in t_bbox for t in t_bbox[direction]])
@ -125,6 +119,12 @@ class Stream(BaseParser):
for i in range(0, len(cols) - 1)] for i in range(0, len(cols) - 1)]
return cols return cols
def _validate_columns(self):
if self.table_area is not None and self.columns is not None:
if len(self.table_area) != len(self.columns):
raise ValueError("Length of table_area and columns"
" should be equal")
def _generate_table_bbox(self): def _generate_table_bbox(self):
if self.table_area is not None: if self.table_area is not None:
table_bbox = {} table_bbox = {}
@ -169,7 +169,7 @@ class Stream(BaseParser):
if ncols == 1: if ncols == 1:
# no tables condition # no tables condition
warnings.warn("No tables found on {}".format( warnings.warn("No tables found on {}".format(
os.path.basename(self.basename))) os.path.basename(self.rootname)))
cols = [(t.x0, t.x1) cols = [(t.x0, t.x1)
for r in rows_grouped if len(r) == ncols for t in r] for r in rows_grouped if len(r) == ncols for t in r]
cols = self._merge_columns(sorted(cols), mtol=self.mtol) cols = self._merge_columns(sorted(cols), mtol=self.mtol)
@ -213,7 +213,7 @@ class Stream(BaseParser):
table.accuracy = accuracy table.accuracy = accuracy
table.whitespace = whitespace table.whitespace = whitespace
table.order = table_idx + 1 table.order = table_idx + 1
table.page = int(os.path.basename(self.basename).replace('page-', '')) table.page = int(os.path.basename(self.rootname).replace('page-', ''))
return table return table
@ -233,7 +233,7 @@ class Stream(BaseParser):
if not self.horizontal_text: if not self.horizontal_text:
warnings.warn("No tables found on {}".format( warnings.warn("No tables found on {}".format(
os.path.basename(self.basename))) os.path.basename(self.rootname)))
return [], self.g return [], self.g
self._generate_table_bbox() self._generate_table_bbox()