Add export to TableList and Table
parent
557189da24
commit
0c329634e7
|
|
@ -9,9 +9,10 @@ Camelot is a Python 2.7 library and command-line tool for getting tables out of
|
||||||
>>> tables = camelot.read_pdf("foo.pdf")
|
>>> tables = camelot.read_pdf("foo.pdf")
|
||||||
>>> tables
|
>>> tables
|
||||||
<TableList n=2>
|
<TableList n=2>
|
||||||
>>> tables.to_csv(zip=True) # to_json, to_excel, to_html
|
>>> tables.export("foo.csv", f="csv", compress=True) # json, excel, html
|
||||||
>>> tables[0]
|
>>> tables[0]
|
||||||
<Table shape=(3,4)>
|
<Table shape=(3,4)>
|
||||||
|
>>> tables[0].to_csv("foo.csv") # to_json, to_excel, to_html
|
||||||
>>> tables[0].parsing_report
|
>>> tables[0].parsing_report
|
||||||
{
|
{
|
||||||
"accuracy": 96,
|
"accuracy": 96,
|
||||||
|
|
@ -20,7 +21,6 @@ Camelot is a Python 2.7 library and command-line tool for getting tables out of
|
||||||
"page": 1
|
"page": 1
|
||||||
}
|
}
|
||||||
>>> df = tables[0].df
|
>>> df = tables[0].df
|
||||||
>>> tables[0].to_csv("foo.csv") # to_json, to_excel, to_html
|
|
||||||
</pre>
|
</pre>
|
||||||
|
|
||||||
Camelot comes with a CLI where you can specify page numbers, output format, output directory, etc. By default, the output files are placed in the same directory as the PDF.
|
Camelot comes with a CLI where you can specify page numbers, output format, output directory, etc. By default, the output files are placed in the same directory as the PDF.
|
||||||
|
|
|
||||||
373
camelot/core.py
373
camelot/core.py
|
|
@ -1,6 +1,10 @@
|
||||||
|
import os
|
||||||
import json
|
import json
|
||||||
|
import zipfile
|
||||||
|
import tempfile
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
class Cell(object):
|
class Cell(object):
|
||||||
|
|
@ -68,16 +72,46 @@ class Table(object):
|
||||||
self.rows = rows
|
self.rows = rows
|
||||||
self.cells = [[Cell(c[0], r[1], c[1], r[0])
|
self.cells = [[Cell(c[0], r[1], c[1], r[0])
|
||||||
for c in cols] for r in rows]
|
for c in cols] for r in rows]
|
||||||
self._df = None
|
self.df = None
|
||||||
self._shape = (0, 0)
|
self.shape = (0, 0)
|
||||||
self._accuracy = 0
|
self.accuracy = 0
|
||||||
self._whitespace = 0
|
self.whitespace = 0
|
||||||
self._order = None
|
self.order = None
|
||||||
self._page = None
|
self.page = None
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return '<{} shape={}>'.format(self.__class__.__name__, self._shape)
|
return '<{} shape={}>'.format(self.__class__.__name__, self._shape)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def data(self):
|
||||||
|
"""
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
|
||||||
|
"""
|
||||||
|
d = []
|
||||||
|
for row in self.cells:
|
||||||
|
d.append([cell.text.strip() for cell in row])
|
||||||
|
return d
|
||||||
|
|
||||||
|
@property
|
||||||
|
def parsing_report(self):
|
||||||
|
"""
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
|
||||||
|
"""
|
||||||
|
# pretty?
|
||||||
|
report = {
|
||||||
|
'accuracy': self.accuracy,
|
||||||
|
'whitespace': self.whitespace,
|
||||||
|
'order': self.order,
|
||||||
|
'page': self.page
|
||||||
|
}
|
||||||
|
return report
|
||||||
|
|
||||||
def set_border(self):
|
def set_border(self):
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
@ -253,119 +287,38 @@ class Table(object):
|
||||||
cell.hspan = True
|
cell.hspan = True
|
||||||
return self
|
return self
|
||||||
|
|
||||||
@property
|
def to_csv(self, path, **kwargs):
|
||||||
def data(self):
|
kw = {
|
||||||
"""
|
'encoding': 'utf-8',
|
||||||
|
'index': False,
|
||||||
Returns
|
'quoting': 1
|
||||||
-------
|
|
||||||
|
|
||||||
"""
|
|
||||||
d = []
|
|
||||||
for row in self.cells:
|
|
||||||
d.append([cell.text.strip() for cell in row])
|
|
||||||
return d
|
|
||||||
|
|
||||||
@property
|
|
||||||
def df(self):
|
|
||||||
"""
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
|
|
||||||
"""
|
|
||||||
return self._df
|
|
||||||
|
|
||||||
@df.setter
|
|
||||||
def df(self, dataframe):
|
|
||||||
self._df = dataframe
|
|
||||||
|
|
||||||
@property
|
|
||||||
def shape(self):
|
|
||||||
"""
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
|
|
||||||
"""
|
|
||||||
return self._shape
|
|
||||||
|
|
||||||
@shape.setter
|
|
||||||
def shape(self, s):
|
|
||||||
self._shape = s
|
|
||||||
|
|
||||||
@property
|
|
||||||
def accuracy(self):
|
|
||||||
"""
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
|
|
||||||
"""
|
|
||||||
return self._accuracy
|
|
||||||
|
|
||||||
@accuracy.setter
|
|
||||||
def accuracy(self, a):
|
|
||||||
self._accuracy = a
|
|
||||||
|
|
||||||
@property
|
|
||||||
def whitespace(self):
|
|
||||||
"""
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
|
|
||||||
"""
|
|
||||||
return self._whitespace
|
|
||||||
|
|
||||||
@whitespace.setter
|
|
||||||
def whitespace(self, w):
|
|
||||||
self._whitespace = w
|
|
||||||
|
|
||||||
@property
|
|
||||||
def order(self):
|
|
||||||
"""
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
|
|
||||||
"""
|
|
||||||
return self._order
|
|
||||||
|
|
||||||
@order.setter
|
|
||||||
def order(self, o):
|
|
||||||
self._order = o
|
|
||||||
|
|
||||||
@property
|
|
||||||
def page(self):
|
|
||||||
"""
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
|
|
||||||
"""
|
|
||||||
return self._page
|
|
||||||
|
|
||||||
@page.setter
|
|
||||||
def page(self, p):
|
|
||||||
self._page = p
|
|
||||||
|
|
||||||
@property
|
|
||||||
def parsing_report(self):
|
|
||||||
"""
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
|
|
||||||
"""
|
|
||||||
# pretty?
|
|
||||||
report = {
|
|
||||||
'accuracy': self._accuracy,
|
|
||||||
'whitespace': self._whitespace,
|
|
||||||
'order': self._order,
|
|
||||||
'page': self._page
|
|
||||||
}
|
}
|
||||||
return report
|
kw.update(kwargs)
|
||||||
|
self.df.to_csv(path, **kw)
|
||||||
|
|
||||||
|
def to_json(self, path, **kwargs):
|
||||||
|
kw = {
|
||||||
|
'orient': 'records'
|
||||||
|
}
|
||||||
|
kw.update(kwargs)
|
||||||
|
json_string = self.df.to_json(**kw)
|
||||||
|
with open(path, 'w') as f:
|
||||||
|
f.write(json_string)
|
||||||
|
|
||||||
|
def to_excel(self, path, **kwargs):
|
||||||
|
kw = {
|
||||||
|
'sheet_name': 'page-{}-table-{}'.format(self.page, self.order),
|
||||||
|
'encoding': 'utf-8'
|
||||||
|
}
|
||||||
|
kw.update(kwargs)
|
||||||
|
writer = pd.ExcelWriter(path)
|
||||||
|
self.df.to_excel(writer, **kw)
|
||||||
|
writer.save()
|
||||||
|
|
||||||
|
def to_html(self, path, **kwargs):
|
||||||
|
html_string = self.df.to_html(**kwargs)
|
||||||
|
with open(path, 'w') as f:
|
||||||
|
f.write(html_string)
|
||||||
|
|
||||||
|
|
||||||
class TableList(object):
|
class TableList(object):
|
||||||
|
|
@ -385,72 +338,82 @@ class TableList(object):
|
||||||
def __getitem__(self, idx):
|
def __getitem__(self, idx):
|
||||||
return self._tables[idx]
|
return self._tables[idx]
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _format_func(table, f):
|
||||||
|
return getattr(table, 'to_{}'.format(f))
|
||||||
|
|
||||||
|
def _write_file(self, f=None, **kwargs):
|
||||||
|
dirname = kwargs.get('dirname')
|
||||||
|
root = kwargs.get('root')
|
||||||
|
ext = kwargs.get('ext')
|
||||||
|
for table in self._tables:
|
||||||
|
filename = os.path.join('{}-page-{}-table-{}{}'.format(
|
||||||
|
root, table.page, table.order, ext))
|
||||||
|
filepath = os.path.join(dirname, filename)
|
||||||
|
to_format = self._format_func(table, f)
|
||||||
|
to_format(filepath)
|
||||||
|
|
||||||
|
def _compress_dir(self, **kwargs):
|
||||||
|
path = kwargs.get('path')
|
||||||
|
dirname = kwargs.get('dirname')
|
||||||
|
root = kwargs.get('root')
|
||||||
|
ext = kwargs.get('ext')
|
||||||
|
zipname = os.path.join(os.path.dirname(path), root) + '.zip'
|
||||||
|
with zipfile.ZipFile(zipname, 'w', allowZip64=True) as z:
|
||||||
|
for table in self._tables:
|
||||||
|
filename = os.path.join('{}-page-{}-table-{}{}'.format(
|
||||||
|
root, table.page, table.order, ext))
|
||||||
|
filepath = os.path.join(dirname, filename)
|
||||||
|
z.write(filepath, os.path.basename(filepath))
|
||||||
|
|
||||||
|
def export(self, path, f='csv', compress=False):
|
||||||
|
dirname = os.path.dirname(path)
|
||||||
|
basename = os.path.basename(path)
|
||||||
|
root, ext = os.path.splitext(basename)
|
||||||
|
if compress:
|
||||||
|
dirname = tempfile.mkdtemp()
|
||||||
|
|
||||||
|
kwargs = {
|
||||||
|
'path': path,
|
||||||
|
'dirname': dirname,
|
||||||
|
'root': root,
|
||||||
|
'ext': ext
|
||||||
|
}
|
||||||
|
|
||||||
|
if f in ['csv', 'json', 'html']:
|
||||||
|
self._write_file(f=f, **kwargs)
|
||||||
|
if compress:
|
||||||
|
self._compress_dir(**kwargs)
|
||||||
|
elif f == 'excel':
|
||||||
|
filepath = os.path.join(dirname, basename)
|
||||||
|
writer = pd.ExcelWriter(filepath)
|
||||||
|
for table in self._tables:
|
||||||
|
sheet_name = 'page-{}-table-{}'.format(table.page, table.order)
|
||||||
|
table.df.to_excel(writer, sheet_name=sheet_name, encoding='utf-8')
|
||||||
|
writer.save()
|
||||||
|
if compress:
|
||||||
|
zipname = os.path.join(os.path.dirname(path), root) + '.zip'
|
||||||
|
with zipfile.ZipFile(zipname, 'w', allowZip64=True) as z:
|
||||||
|
z.write(filepath, os.path.basename(filepath))
|
||||||
|
|
||||||
|
|
||||||
class Geometry(object):
|
class Geometry(object):
|
||||||
"""
|
"""
|
||||||
|
|
||||||
"""
|
"""
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self._text = []
|
self.text = []
|
||||||
self._images = ()
|
self.images = ()
|
||||||
self._segments = ()
|
self.segments = ()
|
||||||
self._tables = []
|
self.tables = []
|
||||||
|
|
||||||
@property
|
def __repr__(self):
|
||||||
def text(self):
|
return '<{} text={} images={} segments={} tables={}>'.format(
|
||||||
"""
|
self.__class__.__name__,
|
||||||
|
len(self.text),
|
||||||
Returns
|
len(self.images),
|
||||||
-------
|
len(self.segments),
|
||||||
|
len(self.tables))
|
||||||
"""
|
|
||||||
return self._text
|
|
||||||
|
|
||||||
@text.setter
|
|
||||||
def text(self, t):
|
|
||||||
self._text = t
|
|
||||||
|
|
||||||
@property
|
|
||||||
def images(self):
|
|
||||||
"""
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
|
|
||||||
"""
|
|
||||||
return self._images
|
|
||||||
|
|
||||||
@images.setter
|
|
||||||
def images(self, i):
|
|
||||||
self._images = i
|
|
||||||
|
|
||||||
@property
|
|
||||||
def segments(self):
|
|
||||||
"""
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
|
|
||||||
"""
|
|
||||||
return self._segments
|
|
||||||
|
|
||||||
@segments.setter
|
|
||||||
def segments(self, s):
|
|
||||||
self._segments = s
|
|
||||||
|
|
||||||
@property
|
|
||||||
def tables(self):
|
|
||||||
"""
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
|
|
||||||
"""
|
|
||||||
return self._tables
|
|
||||||
|
|
||||||
@tables.setter
|
|
||||||
def tables(self, tb):
|
|
||||||
self._tables = tb
|
|
||||||
|
|
||||||
|
|
||||||
class GeometryList(object):
|
class GeometryList(object):
|
||||||
|
|
@ -458,55 +421,15 @@ class GeometryList(object):
|
||||||
|
|
||||||
"""
|
"""
|
||||||
def __init__(self, geometry):
|
def __init__(self, geometry):
|
||||||
self._text = [g.text for g in geometry]
|
self.text = [g.text for g in geometry]
|
||||||
self._images = [g.images for g in geometry]
|
self.images = [g.images for g in geometry]
|
||||||
self._segments = [g.segments for g in geometry]
|
self.segments = [g.segments for g in geometry]
|
||||||
self._tables = [g.tables for g in geometry]
|
self.tables = [g.tables for g in geometry]
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return '<{} text={} images={} segments={} tables={}>'.format(
|
return '<{} text={} images={} segments={} tables={}>'.format(
|
||||||
self.__class__.__name__,
|
self.__class__.__name__,
|
||||||
len(self._text),
|
len(self.text),
|
||||||
len(self._images),
|
len(self.images),
|
||||||
len(self._segments),
|
len(self.segments),
|
||||||
len(self._tables))
|
len(self.tables))
|
||||||
|
|
||||||
@property
|
|
||||||
def text(self):
|
|
||||||
"""
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
|
|
||||||
"""
|
|
||||||
return self._text
|
|
||||||
|
|
||||||
@property
|
|
||||||
def images(self):
|
|
||||||
"""
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
|
|
||||||
"""
|
|
||||||
return self._images
|
|
||||||
|
|
||||||
@property
|
|
||||||
def segments(self):
|
|
||||||
"""
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
|
|
||||||
"""
|
|
||||||
return self._segments
|
|
||||||
|
|
||||||
@property
|
|
||||||
def tables(self):
|
|
||||||
"""
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
|
|
||||||
"""
|
|
||||||
return self._tables
|
|
||||||
|
|
@ -17,7 +17,7 @@ class PDFHandler(object):
|
||||||
if not self.filename.endswith('.pdf'):
|
if not self.filename.endswith('.pdf'):
|
||||||
raise TypeError("File format not supported.")
|
raise TypeError("File format not supported.")
|
||||||
self.pages = self.__get_pages(self.filename, pages)
|
self.pages = self.__get_pages(self.filename, pages)
|
||||||
self.temp = tempfile.mkdtemp()
|
self.tempdir = tempfile.mkdtemp()
|
||||||
|
|
||||||
def __get_pages(self, filename, pages):
|
def __get_pages(self, filename, pages):
|
||||||
# refactor
|
# refactor
|
||||||
|
|
@ -47,7 +47,7 @@ class PDFHandler(object):
|
||||||
with open(filename, 'rb') as fileobj:
|
with open(filename, 'rb') as fileobj:
|
||||||
infile = PdfFileReader(fileobj, strict=False)
|
infile = PdfFileReader(fileobj, strict=False)
|
||||||
fpath = os.path.join(temp, 'page-{0}.pdf'.format(page))
|
fpath = os.path.join(temp, 'page-{0}.pdf'.format(page))
|
||||||
fname, fext = os.path.splitext(fpath)
|
froot, fext = os.path.splitext(fpath)
|
||||||
p = infile.getPage(page - 1)
|
p = infile.getPage(page - 1)
|
||||||
outfile = PdfFileWriter()
|
outfile = PdfFileWriter()
|
||||||
outfile.addPage(p)
|
outfile.addPage(p)
|
||||||
|
|
@ -60,7 +60,7 @@ class PDFHandler(object):
|
||||||
ltchar = get_text_objects(layout, ltype="char")
|
ltchar = get_text_objects(layout, ltype="char")
|
||||||
rotation = get_rotation(lttextlh, lttextlv, ltchar)
|
rotation = get_rotation(lttextlh, lttextlv, ltchar)
|
||||||
if rotation != '':
|
if rotation != '':
|
||||||
fpath_new = ''.join([fname.replace('page', 'p'), '_rotated', fext])
|
fpath_new = ''.join([froot.replace('page', 'p'), '_rotated', fext])
|
||||||
os.rename(fpath, fpath_new)
|
os.rename(fpath, fpath_new)
|
||||||
infile = PdfFileReader(open(fpath_new, 'rb'), strict=False)
|
infile = PdfFileReader(open(fpath_new, 'rb'), strict=False)
|
||||||
outfile = PdfFileWriter()
|
outfile = PdfFileWriter()
|
||||||
|
|
@ -86,8 +86,8 @@ class PDFHandler(object):
|
||||||
|
|
||||||
"""
|
"""
|
||||||
for p in self.pages:
|
for p in self.pages:
|
||||||
self.__save_page(self.filename, p, self.temp)
|
self.__save_page(self.filename, p, self.tempdir)
|
||||||
pages = [os.path.join(self.temp, 'page-{0}.pdf'.format(p))
|
pages = [os.path.join(self.tempdir, 'page-{0}.pdf'.format(p))
|
||||||
for p in self.pages]
|
for p in self.pages]
|
||||||
tables = []
|
tables = []
|
||||||
geometry = []
|
geometry = []
|
||||||
|
|
|
||||||
|
|
@ -18,5 +18,5 @@ class BaseParser(object):
|
||||||
self.horizontal_text = get_text_objects(self.layout, ltype="lh")
|
self.horizontal_text = get_text_objects(self.layout, ltype="lh")
|
||||||
self.vertical_text = get_text_objects(self.layout, ltype="lv")
|
self.vertical_text = get_text_objects(self.layout, ltype="lv")
|
||||||
self.pdf_width, self.pdf_height = self.dimensions
|
self.pdf_width, self.pdf_height = self.dimensions
|
||||||
self.basename, __ = os.path.splitext(self.filename)
|
self.rootname, __ = os.path.splitext(self.filename)
|
||||||
self.g = Geometry()
|
self.g = Geometry()
|
||||||
|
|
@ -85,7 +85,7 @@ class Lattice(BaseParser):
|
||||||
return t
|
return t
|
||||||
|
|
||||||
def _generate_image(self):
|
def _generate_image(self):
|
||||||
self.imagename = ''.join([self.basename, '.png'])
|
self.imagename = ''.join([self.rootname, '.png'])
|
||||||
gs_call = [
|
gs_call = [
|
||||||
"-q", "-sDEVICE=png16m", "-o", self.imagename, "-r600", self.filename
|
"-q", "-sDEVICE=png16m", "-o", self.imagename, "-r600", self.filename
|
||||||
]
|
]
|
||||||
|
|
@ -164,7 +164,7 @@ class Lattice(BaseParser):
|
||||||
v_s = kwargs.get('v_s')
|
v_s = kwargs.get('v_s')
|
||||||
h_s = kwargs.get('h_s')
|
h_s = kwargs.get('h_s')
|
||||||
if v_s is None or h_s is None:
|
if v_s is None or h_s is None:
|
||||||
raise ValueError('No segments found on {}'.format(self.basename))
|
raise ValueError('No segments found on {}'.format(self.rootname))
|
||||||
|
|
||||||
table = Table(cols, rows)
|
table = Table(cols, rows)
|
||||||
# set table edges to True using ver+hor lines
|
# set table edges to True using ver+hor lines
|
||||||
|
|
@ -199,7 +199,7 @@ class Lattice(BaseParser):
|
||||||
table.accuracy = accuracy
|
table.accuracy = accuracy
|
||||||
table.whitespace = whitespace
|
table.whitespace = whitespace
|
||||||
table.order = table_idx + 1
|
table.order = table_idx + 1
|
||||||
table.page = int(os.path.basename(self.basename).replace('page-', ''))
|
table.page = int(os.path.basename(self.rootname).replace('page-', ''))
|
||||||
|
|
||||||
return table
|
return table
|
||||||
|
|
||||||
|
|
@ -219,7 +219,7 @@ class Lattice(BaseParser):
|
||||||
|
|
||||||
if not self.horizontal_text:
|
if not self.horizontal_text:
|
||||||
warnings.warn("No tables found on {}".format(
|
warnings.warn("No tables found on {}".format(
|
||||||
os.path.basename(self.basename)))
|
os.path.basename(self.rootname)))
|
||||||
return [], self.g
|
return [], self.g
|
||||||
|
|
||||||
self._generate_image()
|
self._generate_image()
|
||||||
|
|
|
||||||
|
|
@ -32,12 +32,6 @@ class Stream(BaseParser):
|
||||||
self.flag_size = flag_size
|
self.flag_size = flag_size
|
||||||
self.debug = debug
|
self.debug = debug
|
||||||
|
|
||||||
def _validate_columns(self):
|
|
||||||
if self.table_area is not None and self.columns is not None:
|
|
||||||
if len(self.table_area) != len(self.columns):
|
|
||||||
raise ValueError("Length of table_area and columns"
|
|
||||||
" should be equal")
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _text_bbox(t_bbox):
|
def _text_bbox(t_bbox):
|
||||||
xmin = min([t.x0 for direction in t_bbox for t in t_bbox[direction]])
|
xmin = min([t.x0 for direction in t_bbox for t in t_bbox[direction]])
|
||||||
|
|
@ -125,6 +119,12 @@ class Stream(BaseParser):
|
||||||
for i in range(0, len(cols) - 1)]
|
for i in range(0, len(cols) - 1)]
|
||||||
return cols
|
return cols
|
||||||
|
|
||||||
|
def _validate_columns(self):
|
||||||
|
if self.table_area is not None and self.columns is not None:
|
||||||
|
if len(self.table_area) != len(self.columns):
|
||||||
|
raise ValueError("Length of table_area and columns"
|
||||||
|
" should be equal")
|
||||||
|
|
||||||
def _generate_table_bbox(self):
|
def _generate_table_bbox(self):
|
||||||
if self.table_area is not None:
|
if self.table_area is not None:
|
||||||
table_bbox = {}
|
table_bbox = {}
|
||||||
|
|
@ -169,7 +169,7 @@ class Stream(BaseParser):
|
||||||
if ncols == 1:
|
if ncols == 1:
|
||||||
# no tables condition
|
# no tables condition
|
||||||
warnings.warn("No tables found on {}".format(
|
warnings.warn("No tables found on {}".format(
|
||||||
os.path.basename(self.basename)))
|
os.path.basename(self.rootname)))
|
||||||
cols = [(t.x0, t.x1)
|
cols = [(t.x0, t.x1)
|
||||||
for r in rows_grouped if len(r) == ncols for t in r]
|
for r in rows_grouped if len(r) == ncols for t in r]
|
||||||
cols = self._merge_columns(sorted(cols), mtol=self.mtol)
|
cols = self._merge_columns(sorted(cols), mtol=self.mtol)
|
||||||
|
|
@ -213,7 +213,7 @@ class Stream(BaseParser):
|
||||||
table.accuracy = accuracy
|
table.accuracy = accuracy
|
||||||
table.whitespace = whitespace
|
table.whitespace = whitespace
|
||||||
table.order = table_idx + 1
|
table.order = table_idx + 1
|
||||||
table.page = int(os.path.basename(self.basename).replace('page-', ''))
|
table.page = int(os.path.basename(self.rootname).replace('page-', ''))
|
||||||
|
|
||||||
return table
|
return table
|
||||||
|
|
||||||
|
|
@ -233,7 +233,7 @@ class Stream(BaseParser):
|
||||||
|
|
||||||
if not self.horizontal_text:
|
if not self.horizontal_text:
|
||||||
warnings.warn("No tables found on {}".format(
|
warnings.warn("No tables found on {}".format(
|
||||||
os.path.basename(self.basename)))
|
os.path.basename(self.rootname)))
|
||||||
return [], self.g
|
return [], self.g
|
||||||
|
|
||||||
self._generate_table_bbox()
|
self._generate_table_bbox()
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue