Decouple debug geometry from tables
parent
941994f0bf
commit
b9d77cb983
|
|
@ -8,7 +8,7 @@ Camelot is a Python 2.7 library and command-line tool for getting tables out of
|
||||||
>>> import camelot
|
>>> import camelot
|
||||||
>>> tables = camelot.read_pdf("foo.pdf")
|
>>> tables = camelot.read_pdf("foo.pdf")
|
||||||
>>> tables
|
>>> tables
|
||||||
<TableSet n=2>
|
<TableList n=2>
|
||||||
>>> tables.to_csv(zip=True) # to_json, to_excel, to_html
|
>>> tables.to_csv(zip=True) # to_json, to_excel, to_html
|
||||||
>>> tables[0]
|
>>> tables[0]
|
||||||
<Table shape=(3,4)>
|
<Table shape=(3,4)>
|
||||||
|
|
@ -19,8 +19,8 @@ Camelot is a Python 2.7 library and command-line tool for getting tables out of
|
||||||
"time_taken": 0.5,
|
"time_taken": 0.5,
|
||||||
"page": 1
|
"page": 1
|
||||||
}
|
}
|
||||||
|
>>> df = tables[0].df
|
||||||
>>> tables[0].to_csv("foo.csv") # to_json, to_excel, to_html
|
>>> tables[0].to_csv("foo.csv") # to_json, to_excel, to_html
|
||||||
>>> df = tables[0].to_df()
|
|
||||||
</pre>
|
</pre>
|
||||||
|
|
||||||
Camelot comes with a CLI where you can specify page numbers, output format, output directory etc. By default, the output files are placed in the same directory as the PDF.
|
Camelot comes with a CLI where you can specify page numbers, output format, output directory etc. By default, the output files are placed in the same directory as the PDF.
|
||||||
|
|
|
||||||
|
|
@ -1,3 +1,4 @@
|
||||||
from .__version__ import __version__
|
from .__version__ import __version__
|
||||||
|
|
||||||
from .io import read_pdf
|
from .io import read_pdf
|
||||||
|
from .plot import plot_geometry
|
||||||
|
|
@ -21,7 +21,6 @@ class Cell(object):
|
||||||
self.text = ''
|
self.text = ''
|
||||||
self.spanning_h = False
|
self.spanning_h = False
|
||||||
self.spanning_v = False
|
self.spanning_v = False
|
||||||
self.image = None
|
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
pass
|
pass
|
||||||
|
|
@ -49,8 +48,6 @@ class Table(object):
|
||||||
self.rows = rows
|
self.rows = rows
|
||||||
self.cells = [[Cell(c[0], r[1], c[1], r[0])
|
self.cells = [[Cell(c[0], r[1], c[1], r[0])
|
||||||
for c in cols] for r in rows]
|
for c in cols] for r in rows]
|
||||||
self.nocont_ = 0
|
|
||||||
self.image = None
|
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
pass
|
pass
|
||||||
|
|
@ -227,9 +224,66 @@ class Table(object):
|
||||||
return ar
|
return ar
|
||||||
|
|
||||||
|
|
||||||
class TableList(list):
    """List of the tables extracted from a PDF.

    Subclasses ``list`` so that indexing, iteration and ``len`` operate
    directly on the tables it was constructed with.

    Parameters
    ----------
    tables : iterable
        Table objects, in extraction order.
    """
    def __init__(self, tables):
        # Populate the underlying list; without this call the instance
        # stays empty and ``tables[0]`` raises IndexError.
        super(TableList, self).__init__(tables)
        # Kept because the original constructor exposed this attribute.
        self._tables = tables

    def __repr__(self):
        return '<{} tables={}>'.format(
            self.__class__.__name__, len(self))
|
||||||
|
|
||||||
|
|
||||||
|
class Geometry(object):
    """Container for the debug geometry gathered from one page.

    Parameters
    ----------
    text : list
        (optional, default: None, stored as an empty list)

    images : list
        (optional, default: None, stored as an empty list)

    segments : list
        (optional, default: None, stored as an empty list)

    tables : list
        (optional, default: None, stored as an empty list)

    Attributes
    ----------
    text : list
        Text-line boxes; the Stream parser stores (x0, y0, x1, y1)
        tuples here.

    images : list
        The Lattice parser stores a single (image, table_bbox) tuple
        here.

    segments : list
        The Lattice parser stores a single
        (vertical_segments, horizontal_segments) tuple here.

    tables : list
        The Lattice parser stores the Table objects it built here.
    """
    def __init__(self, text=None, images=None, segments=None, tables=None):
        # Plain attributes: the former property pairs only forwarded
        # to private fields without adding any logic.
        self.text = [] if text is None else text
        self.images = [] if images is None else images
        self.segments = [] if segments is None else segments
        self.tables = [] if tables is None else tables

    def __repr__(self):
        return '<{} text={} images={} segments={} tables={}>'.format(
            self.__class__.__name__,
            len(self.text),
            len(self.images),
            len(self.segments),
            len(self.tables))
|
||||||
|
|
||||||
|
|
||||||
|
class GeometryList(object):
    """Per-page debug geometry collected over all parsed pages.

    Each attribute holds one entry per geometry object passed in, in
    the same order.

    Parameters
    ----------
    geometry : iterable
        Objects exposing ``text``, ``images``, ``segments`` and
        ``tables`` attributes.
    """
    def __init__(self, geometry):
        # Materialise once so that a one-shot iterable can still be
        # traversed four times below.
        geometry = list(geometry)
        self._text = [g.text for g in geometry]
        self._images = [g.images for g in geometry]
        self._segments = [g.segments for g in geometry]
        self._tables = [g.tables for g in geometry]

    # Read-only public accessors; plot_geometry reads these names.
    @property
    def text(self):
        return self._text

    @property
    def images(self):
        return self._images

    @property
    def segments(self):
        return self._segments

    @property
    def tables(self):
        return self._tables

    def __repr__(self):
        return '<{} text={} images={} segments={} tables={}>'.format(
            self.__class__.__name__,
            len(self._text),
            len(self._images),
            len(self._segments),
            len(self._tables))
|
||||||
|
|
@ -74,10 +74,11 @@ class PDFHandler(object):
|
||||||
self.__save_page(self.filename, p, self.temp)
|
self.__save_page(self.filename, p, self.temp)
|
||||||
pages = [os.path.join(self.temp, 'page-{0}.pdf'.format(p))
|
pages = [os.path.join(self.temp, 'page-{0}.pdf'.format(p))
|
||||||
for p in self.pages]
|
for p in self.pages]
|
||||||
tables = {}
|
tables = []
|
||||||
|
geometry = []
|
||||||
parser = Stream(**kwargs) if not mesh else Lattice(**kwargs)
|
parser = Stream(**kwargs) if not mesh else Lattice(**kwargs)
|
||||||
for p in pages:
|
for p in pages:
|
||||||
table = parser.get_tables(p)
|
t, g = parser.extract_tables(p)
|
||||||
if table is not None:
|
tables.extend(t)
|
||||||
tables.update(table)
|
geometry.extend(g)
|
||||||
return tables
|
return TableList(tables), GeometryList(geometry)
|
||||||
|
|
@ -4,4 +4,5 @@ from .handlers import PDFHandler
|
||||||
def read_pdf(filepath, pages='1', mesh=False, **kwargs):
    """Extracts tables from a PDF file.

    Parameters
    ----------
    filepath : str
        Passed to PDFHandler.

    pages : str
        Passed to PDFHandler.
        (optional, default: '1')

    mesh : bool
        Forwarded to PDFHandler.parse, which uses Lattice when this is
        truthy and Stream otherwise.
        (optional, default: False)

    kwargs
        Forwarded unchanged to PDFHandler.parse.

    Returns
    -------
    tables : object
        First element of the pair returned by PDFHandler.parse.
    """
    handler = PDFHandler(filepath, pages)
    # parse() returns a (tables, geometry) pair; the geometry half is
    # not needed here.
    tables, _geometry = handler.parse(mesh=mesh, **kwargs)
    return tables
|
||||||
|
|
@ -10,7 +10,7 @@ import subprocess
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
from .core import Table
|
from .core import Table, Geometry
|
||||||
from .image_processing import (adaptive_threshold, find_lines, find_table_contours,
|
from .image_processing import (adaptive_threshold, find_lines, find_table_contours,
|
||||||
find_table_joints)
|
find_table_joints)
|
||||||
from .utils import (scale_to_pdf, scale_to_image, segments_bbox, text_in_bbox,
|
from .utils import (scale_to_pdf, scale_to_image, segments_bbox, text_in_bbox,
|
||||||
|
|
@ -30,192 +30,6 @@ def _reduce_method(m):
|
||||||
copy_reg.pickle(types.MethodType, _reduce_method)
|
copy_reg.pickle(types.MethodType, _reduce_method)
|
||||||
|
|
||||||
|
|
||||||
def _text_bbox(t_bbox):
|
|
||||||
"""Returns bounding box for the text present on a page.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
t_bbox : dict
|
|
||||||
Dict with two keys 'horizontal' and 'vertical' with lists of
|
|
||||||
LTTextLineHorizontals and LTTextLineVerticals respectively.
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
text_bbox : tuple
|
|
||||||
Tuple of the form (x0, y0, x1, y1) in PDFMiner's coordinate
|
|
||||||
space.
|
|
||||||
"""
|
|
||||||
xmin = min([t.x0 for direction in t_bbox for t in t_bbox[direction]])
|
|
||||||
ymin = min([t.y0 for direction in t_bbox for t in t_bbox[direction]])
|
|
||||||
xmax = max([t.x1 for direction in t_bbox for t in t_bbox[direction]])
|
|
||||||
ymax = max([t.y1 for direction in t_bbox for t in t_bbox[direction]])
|
|
||||||
text_bbox = (xmin, ymin, xmax, ymax)
|
|
||||||
return text_bbox
|
|
||||||
|
|
||||||
|
|
||||||
def _group_rows(text, ytol=2):
|
|
||||||
"""Groups PDFMiner text objects into rows using their
|
|
||||||
y-coordinates taking into account some tolerance ytol.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
text : list
|
|
||||||
List of PDFMiner text objects.
|
|
||||||
|
|
||||||
ytol : int
|
|
||||||
Tolerance parameter.
|
|
||||||
(optional, default: 2)
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
rows : list
|
|
||||||
Two-dimensional list of text objects grouped into rows.
|
|
||||||
"""
|
|
||||||
row_y = 0
|
|
||||||
rows = []
|
|
||||||
temp = []
|
|
||||||
for t in text:
|
|
||||||
# is checking for upright necessary?
|
|
||||||
# if t.get_text().strip() and all([obj.upright for obj in t._objs if
|
|
||||||
# type(obj) is LTChar]):
|
|
||||||
if t.get_text().strip():
|
|
||||||
if not np.isclose(row_y, t.y0, atol=ytol):
|
|
||||||
rows.append(sorted(temp, key=lambda t: t.x0))
|
|
||||||
temp = []
|
|
||||||
row_y = t.y0
|
|
||||||
temp.append(t)
|
|
||||||
rows.append(sorted(temp, key=lambda t: t.x0))
|
|
||||||
__ = rows.pop(0) # hacky
|
|
||||||
return rows
|
|
||||||
|
|
||||||
|
|
||||||
def _merge_columns(l, mtol=0):
|
|
||||||
"""Merges column boundaries if they overlap or lie within some
|
|
||||||
tolerance mtol.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
l : list
|
|
||||||
List of column coordinate tuples.
|
|
||||||
|
|
||||||
mtol : int
|
|
||||||
TODO
|
|
||||||
(optional, default: 0)
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
merged : list
|
|
||||||
List of merged column coordinate tuples.
|
|
||||||
"""
|
|
||||||
merged = []
|
|
||||||
for higher in l:
|
|
||||||
if not merged:
|
|
||||||
merged.append(higher)
|
|
||||||
else:
|
|
||||||
lower = merged[-1]
|
|
||||||
if mtol >= 0:
|
|
||||||
if (higher[0] <= lower[1] or
|
|
||||||
np.isclose(higher[0], lower[1], atol=mtol)):
|
|
||||||
upper_bound = max(lower[1], higher[1])
|
|
||||||
lower_bound = min(lower[0], higher[0])
|
|
||||||
merged[-1] = (lower_bound, upper_bound)
|
|
||||||
else:
|
|
||||||
merged.append(higher)
|
|
||||||
elif mtol < 0:
|
|
||||||
if higher[0] <= lower[1]:
|
|
||||||
if np.isclose(higher[0], lower[1], atol=abs(mtol)):
|
|
||||||
merged.append(higher)
|
|
||||||
else:
|
|
||||||
upper_bound = max(lower[1], higher[1])
|
|
||||||
lower_bound = min(lower[0], higher[0])
|
|
||||||
merged[-1] = (lower_bound, upper_bound)
|
|
||||||
else:
|
|
||||||
merged.append(higher)
|
|
||||||
return merged
|
|
||||||
|
|
||||||
|
|
||||||
def _join_rows(rows_grouped, text_y_max, text_y_min):
|
|
||||||
"""Makes row coordinates continuous.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
rows_grouped : list
|
|
||||||
Two-dimensional list of text objects grouped into rows.
|
|
||||||
|
|
||||||
text_y_max : int
|
|
||||||
|
|
||||||
text_y_min : int
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
rows : list
|
|
||||||
List of continuous row coordinate tuples.
|
|
||||||
"""
|
|
||||||
row_mids = [sum([(t.y0 + t.y1) / 2 for t in r]) / len(r)
|
|
||||||
if len(r) > 0 else 0 for r in rows_grouped]
|
|
||||||
rows = [(row_mids[i] + row_mids[i - 1]) / 2 for i in range(1, len(row_mids))]
|
|
||||||
rows.insert(0, text_y_max)
|
|
||||||
rows.append(text_y_min)
|
|
||||||
rows = [(rows[i], rows[i + 1])
|
|
||||||
for i in range(0, len(rows) - 1)]
|
|
||||||
return rows
|
|
||||||
|
|
||||||
|
|
||||||
def _join_columns(cols, text_x_min, text_x_max):
|
|
||||||
"""Makes column coordinates continuous.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
cols : list
|
|
||||||
List of column coordinate tuples.
|
|
||||||
|
|
||||||
text_x_min : int
|
|
||||||
|
|
||||||
text_y_max : int
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
cols : list
|
|
||||||
Updated list of column coordinate tuples.
|
|
||||||
"""
|
|
||||||
cols = sorted(cols)
|
|
||||||
cols = [(cols[i][0] + cols[i - 1][1]) / 2 for i in range(1, len(cols))]
|
|
||||||
cols.insert(0, text_x_min)
|
|
||||||
cols.append(text_x_max)
|
|
||||||
cols = [(cols[i], cols[i + 1])
|
|
||||||
for i in range(0, len(cols) - 1)]
|
|
||||||
return cols
|
|
||||||
|
|
||||||
|
|
||||||
def _add_columns(cols, text, ytol):
|
|
||||||
"""Adds columns to existing list by taking into account
|
|
||||||
the text that lies outside the current column coordinates.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
cols : list
|
|
||||||
List of column coordinate tuples.
|
|
||||||
|
|
||||||
text : list
|
|
||||||
List of PDFMiner text objects.
|
|
||||||
|
|
||||||
ytol : int
|
|
||||||
Tolerance parameter.
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
cols : list
|
|
||||||
Updated list of column coordinate tuples.
|
|
||||||
"""
|
|
||||||
if text:
|
|
||||||
text = _group_rows(text, ytol=ytol)
|
|
||||||
elements = [len(r) for r in text]
|
|
||||||
new_cols = [(t.x0, t.x1)
|
|
||||||
for r in text if len(r) == max(elements) for t in r]
|
|
||||||
cols.extend(_merge_columns(sorted(new_cols)))
|
|
||||||
return cols
|
|
||||||
|
|
||||||
|
|
||||||
class Stream:
|
class Stream:
|
||||||
"""Stream looks for spaces between text elements to form a table.
|
"""Stream looks for spaces between text elements to form a table.
|
||||||
|
|
||||||
|
|
@ -283,7 +97,193 @@ class Stream:
|
||||||
self.flag_size = flag_size
|
self.flag_size = flag_size
|
||||||
self.debug = debug
|
self.debug = debug
|
||||||
|
|
||||||
def get_tables(self, pdfname):
|
@staticmethod
|
||||||
|
def _text_bbox(t_bbox):
|
||||||
|
"""Returns bounding box for the text present on a page.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
t_bbox : dict
|
||||||
|
Dict with two keys 'horizontal' and 'vertical' with lists of
|
||||||
|
LTTextLineHorizontals and LTTextLineVerticals respectively.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
text_bbox : tuple
|
||||||
|
Tuple of the form (x0, y0, x1, y1) in PDFMiner's coordinate
|
||||||
|
space.
|
||||||
|
"""
|
||||||
|
xmin = min([t.x0 for direction in t_bbox for t in t_bbox[direction]])
|
||||||
|
ymin = min([t.y0 for direction in t_bbox for t in t_bbox[direction]])
|
||||||
|
xmax = max([t.x1 for direction in t_bbox for t in t_bbox[direction]])
|
||||||
|
ymax = max([t.y1 for direction in t_bbox for t in t_bbox[direction]])
|
||||||
|
text_bbox = (xmin, ymin, xmax, ymax)
|
||||||
|
return text_bbox
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _group_rows(text, ytol=2):
|
||||||
|
"""Groups PDFMiner text objects into rows using their
|
||||||
|
y-coordinates taking into account some tolerance ytol.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
text : list
|
||||||
|
List of PDFMiner text objects.
|
||||||
|
|
||||||
|
ytol : int
|
||||||
|
Tolerance parameter.
|
||||||
|
(optional, default: 2)
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
rows : list
|
||||||
|
Two-dimensional list of text objects grouped into rows.
|
||||||
|
"""
|
||||||
|
row_y = 0
|
||||||
|
rows = []
|
||||||
|
temp = []
|
||||||
|
for t in text:
|
||||||
|
# is checking for upright necessary?
|
||||||
|
# if t.get_text().strip() and all([obj.upright for obj in t._objs if
|
||||||
|
# type(obj) is LTChar]):
|
||||||
|
if t.get_text().strip():
|
||||||
|
if not np.isclose(row_y, t.y0, atol=ytol):
|
||||||
|
rows.append(sorted(temp, key=lambda t: t.x0))
|
||||||
|
temp = []
|
||||||
|
row_y = t.y0
|
||||||
|
temp.append(t)
|
||||||
|
rows.append(sorted(temp, key=lambda t: t.x0))
|
||||||
|
__ = rows.pop(0) # hacky
|
||||||
|
return rows
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _merge_columns(l, mtol=0):
|
||||||
|
"""Merges column boundaries if they overlap or lie within some
|
||||||
|
tolerance mtol.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
l : list
|
||||||
|
List of column coordinate tuples.
|
||||||
|
|
||||||
|
mtol : int
|
||||||
|
TODO
|
||||||
|
(optional, default: 0)
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
merged : list
|
||||||
|
List of merged column coordinate tuples.
|
||||||
|
"""
|
||||||
|
merged = []
|
||||||
|
for higher in l:
|
||||||
|
if not merged:
|
||||||
|
merged.append(higher)
|
||||||
|
else:
|
||||||
|
lower = merged[-1]
|
||||||
|
if mtol >= 0:
|
||||||
|
if (higher[0] <= lower[1] or
|
||||||
|
np.isclose(higher[0], lower[1], atol=mtol)):
|
||||||
|
upper_bound = max(lower[1], higher[1])
|
||||||
|
lower_bound = min(lower[0], higher[0])
|
||||||
|
merged[-1] = (lower_bound, upper_bound)
|
||||||
|
else:
|
||||||
|
merged.append(higher)
|
||||||
|
elif mtol < 0:
|
||||||
|
if higher[0] <= lower[1]:
|
||||||
|
if np.isclose(higher[0], lower[1], atol=abs(mtol)):
|
||||||
|
merged.append(higher)
|
||||||
|
else:
|
||||||
|
upper_bound = max(lower[1], higher[1])
|
||||||
|
lower_bound = min(lower[0], higher[0])
|
||||||
|
merged[-1] = (lower_bound, upper_bound)
|
||||||
|
else:
|
||||||
|
merged.append(higher)
|
||||||
|
return merged
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _join_rows(rows_grouped, text_y_max, text_y_min):
|
||||||
|
"""Makes row coordinates continuous.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
rows_grouped : list
|
||||||
|
Two-dimensional list of text objects grouped into rows.
|
||||||
|
|
||||||
|
text_y_max : int
|
||||||
|
|
||||||
|
text_y_min : int
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
rows : list
|
||||||
|
List of continuous row coordinate tuples.
|
||||||
|
"""
|
||||||
|
row_mids = [sum([(t.y0 + t.y1) / 2 for t in r]) / len(r)
|
||||||
|
if len(r) > 0 else 0 for r in rows_grouped]
|
||||||
|
rows = [(row_mids[i] + row_mids[i - 1]) / 2 for i in range(1, len(row_mids))]
|
||||||
|
rows.insert(0, text_y_max)
|
||||||
|
rows.append(text_y_min)
|
||||||
|
rows = [(rows[i], rows[i + 1])
|
||||||
|
for i in range(0, len(rows) - 1)]
|
||||||
|
return rows
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _add_columns(cols, text, ytol):
|
||||||
|
"""Adds columns to existing list by taking into account
|
||||||
|
the text that lies outside the current column coordinates.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
cols : list
|
||||||
|
List of column coordinate tuples.
|
||||||
|
|
||||||
|
text : list
|
||||||
|
List of PDFMiner text objects.
|
||||||
|
|
||||||
|
ytol : int
|
||||||
|
Tolerance parameter.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
cols : list
|
||||||
|
Updated list of column coordinate tuples.
|
||||||
|
"""
|
||||||
|
if text:
|
||||||
|
text = Stream._group_rows(text, ytol=ytol)
|
||||||
|
elements = [len(r) for r in text]
|
||||||
|
new_cols = [(t.x0, t.x1)
|
||||||
|
for r in text if len(r) == max(elements) for t in r]
|
||||||
|
cols.extend(Stream._merge_columns(sorted(new_cols)))
|
||||||
|
return cols
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _join_columns(cols, text_x_min, text_x_max):
|
||||||
|
"""Makes column coordinates continuous.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
cols : list
|
||||||
|
List of column coordinate tuples.
|
||||||
|
|
||||||
|
text_x_min : int
|
||||||
|
|
||||||
|
text_y_max : int
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
cols : list
|
||||||
|
Updated list of column coordinate tuples.
|
||||||
|
"""
|
||||||
|
cols = sorted(cols)
|
||||||
|
cols = [(cols[i][0] + cols[i - 1][1]) / 2 for i in range(1, len(cols))]
|
||||||
|
cols.insert(0, text_x_min)
|
||||||
|
cols.append(text_x_max)
|
||||||
|
cols = [(cols[i], cols[i + 1])
|
||||||
|
for i in range(0, len(cols) - 1)]
|
||||||
|
return cols
|
||||||
|
|
||||||
|
def extract_tables(self, pdfname):
|
||||||
"""Expects a single page pdf as input with rotation corrected.
|
"""Expects a single page pdf as input with rotation corrected.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
|
|
@ -308,11 +308,13 @@ class Stream:
|
||||||
os.path.basename(bname)))
|
os.path.basename(bname)))
|
||||||
return {os.path.basename(bname): None}
|
return {os.path.basename(bname): None}
|
||||||
|
|
||||||
|
g = Geometry()
|
||||||
if self.debug:
|
if self.debug:
|
||||||
self.debug_text = []
|
text = []
|
||||||
self.debug_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in lttextlh])
|
text.extend([(t.x0, t.y0, t.x1, t.y1) for t in lttextlh])
|
||||||
self.debug_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in lttextlv])
|
text.extend([(t.x0, t.y0, t.x1, t.y1) for t in lttextlv])
|
||||||
return None
|
g.text = text
|
||||||
|
return [None], [g]
|
||||||
|
|
||||||
if self.table_area is not None:
|
if self.table_area is not None:
|
||||||
if self.columns is not None:
|
if self.columns is not None:
|
||||||
|
|
@ -354,9 +356,9 @@ class Stream:
|
||||||
table_data['text_p'] = 100 * (1 - (len(char_bbox) / len(ltchar)))
|
table_data['text_p'] = 100 * (1 - (len(char_bbox) / len(ltchar)))
|
||||||
for direction in t_bbox:
|
for direction in t_bbox:
|
||||||
t_bbox[direction].sort(key=lambda x: (-x.y0, x.x0))
|
t_bbox[direction].sort(key=lambda x: (-x.y0, x.x0))
|
||||||
text_x_min, text_y_min, text_x_max, text_y_max = _text_bbox(t_bbox)
|
text_x_min, text_y_min, text_x_max, text_y_max = self._text_bbox(t_bbox)
|
||||||
rows_grouped = _group_rows(t_bbox['horizontal'], ytol=ytolerance[table_no])
|
rows_grouped = self._group_rows(t_bbox['horizontal'], ytol=ytolerance[table_no])
|
||||||
rows = _join_rows(rows_grouped, text_y_max, text_y_min)
|
rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
|
||||||
elements = [len(r) for r in rows_grouped]
|
elements = [len(r) for r in rows_grouped]
|
||||||
|
|
||||||
guess = False
|
guess = False
|
||||||
|
|
@ -380,7 +382,7 @@ class Stream:
|
||||||
os.path.basename(bname)))
|
os.path.basename(bname)))
|
||||||
cols = [(t.x0, t.x1)
|
cols = [(t.x0, t.x1)
|
||||||
for r in rows_grouped if len(r) == ncols for t in r]
|
for r in rows_grouped if len(r) == ncols for t in r]
|
||||||
cols = _merge_columns(sorted(cols), mtol=mtolerance[table_no])
|
cols = self._merge_columns(sorted(cols), mtol=mtolerance[table_no])
|
||||||
inner_text = []
|
inner_text = []
|
||||||
for i in range(1, len(cols)):
|
for i in range(1, len(cols)):
|
||||||
left = cols[i - 1][1]
|
left = cols[i - 1][1]
|
||||||
|
|
@ -392,8 +394,8 @@ class Stream:
|
||||||
for t in t_bbox[direction]
|
for t in t_bbox[direction]
|
||||||
if t.x0 > cols[-1][1] or t.x1 < cols[0][0]]
|
if t.x0 > cols[-1][1] or t.x1 < cols[0][0]]
|
||||||
inner_text.extend(outer_text)
|
inner_text.extend(outer_text)
|
||||||
cols = _add_columns(cols, inner_text, ytolerance[table_no])
|
cols = self._add_columns(cols, inner_text, ytolerance[table_no])
|
||||||
cols = _join_columns(cols, text_x_min, text_x_max)
|
cols = self._join_columns(cols, text_x_min, text_x_max)
|
||||||
|
|
||||||
table = Table(cols, rows)
|
table = Table(cols, rows)
|
||||||
table = table.set_all_edges()
|
table = table.set_all_edges()
|
||||||
|
|
@ -433,87 +435,6 @@ class Stream:
|
||||||
return page
|
return page
|
||||||
|
|
||||||
|
|
||||||
def _reduce_index(t, idx, shift_text):
|
|
||||||
"""Reduces index of a text object if it lies within a spanning
|
|
||||||
cell.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
table : object
|
|
||||||
camelot.table.Table
|
|
||||||
|
|
||||||
idx : list
|
|
||||||
List of tuples of the form (r_idx, c_idx, text).
|
|
||||||
|
|
||||||
shift_text : list
|
|
||||||
{'l', 'r', 't', 'b'}
|
|
||||||
Select one or more from above and pass them as a list to
|
|
||||||
specify where the text in a spanning cell should flow.
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
indices : list
|
|
||||||
List of tuples of the form (idx, text) where idx is the reduced
|
|
||||||
index of row/column and text is the an lttextline substring.
|
|
||||||
"""
|
|
||||||
indices = []
|
|
||||||
for r_idx, c_idx, text in idx:
|
|
||||||
for d in shift_text:
|
|
||||||
if d == 'l':
|
|
||||||
if t.cells[r_idx][c_idx].spanning_h:
|
|
||||||
while not t.cells[r_idx][c_idx].left:
|
|
||||||
c_idx -= 1
|
|
||||||
if d == 'r':
|
|
||||||
if t.cells[r_idx][c_idx].spanning_h:
|
|
||||||
while not t.cells[r_idx][c_idx].right:
|
|
||||||
c_idx += 1
|
|
||||||
if d == 't':
|
|
||||||
if t.cells[r_idx][c_idx].spanning_v:
|
|
||||||
while not t.cells[r_idx][c_idx].top:
|
|
||||||
r_idx -= 1
|
|
||||||
if d == 'b':
|
|
||||||
if t.cells[r_idx][c_idx].spanning_v:
|
|
||||||
while not t.cells[r_idx][c_idx].bottom:
|
|
||||||
r_idx += 1
|
|
||||||
indices.append((r_idx, c_idx, text))
|
|
||||||
return indices
|
|
||||||
|
|
||||||
|
|
||||||
def _fill_spanning(t, fill=None):
|
|
||||||
"""Fills spanning cells.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
t : object
|
|
||||||
camelot.table.Table
|
|
||||||
|
|
||||||
fill : list
|
|
||||||
{'h', 'v'}
|
|
||||||
Specify to fill spanning cells in horizontal or vertical
|
|
||||||
direction.
|
|
||||||
(optional, default: None)
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
t : object
|
|
||||||
camelot.table.Table
|
|
||||||
"""
|
|
||||||
for f in fill:
|
|
||||||
if f == "h":
|
|
||||||
for i in range(len(t.cells)):
|
|
||||||
for j in range(len(t.cells[i])):
|
|
||||||
if t.cells[i][j].get_text().strip() == '':
|
|
||||||
if t.cells[i][j].spanning_h and not t.cells[i][j].left:
|
|
||||||
t.cells[i][j].add_text(t.cells[i][j - 1].get_text())
|
|
||||||
elif f == "v":
|
|
||||||
for i in range(len(t.cells)):
|
|
||||||
for j in range(len(t.cells[i])):
|
|
||||||
if t.cells[i][j].get_text().strip() == '':
|
|
||||||
if t.cells[i][j].spanning_v and not t.cells[i][j].top:
|
|
||||||
t.cells[i][j].add_text(t.cells[i - 1][j].get_text())
|
|
||||||
return t
|
|
||||||
|
|
||||||
|
|
||||||
class Lattice:
|
class Lattice:
|
||||||
"""Lattice looks for lines in the pdf to form a table.
|
"""Lattice looks for lines in the pdf to form a table.
|
||||||
|
|
||||||
|
|
@ -617,7 +538,88 @@ class Lattice:
|
||||||
self.shift_text = shift_text
|
self.shift_text = shift_text
|
||||||
self.debug = debug
|
self.debug = debug
|
||||||
|
|
||||||
def get_tables(self, pdfname):
|
@staticmethod
|
||||||
|
def _reduce_index(t, idx, shift_text):
|
||||||
|
"""Reduces index of a text object if it lies within a spanning
|
||||||
|
cell.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
table : object
|
||||||
|
camelot.table.Table
|
||||||
|
|
||||||
|
idx : list
|
||||||
|
List of tuples of the form (r_idx, c_idx, text).
|
||||||
|
|
||||||
|
shift_text : list
|
||||||
|
{'l', 'r', 't', 'b'}
|
||||||
|
Select one or more from above and pass them as a list to
|
||||||
|
specify where the text in a spanning cell should flow.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
indices : list
|
||||||
|
List of tuples of the form (idx, text) where idx is the reduced
|
||||||
|
index of row/column and text is the an lttextline substring.
|
||||||
|
"""
|
||||||
|
indices = []
|
||||||
|
for r_idx, c_idx, text in idx:
|
||||||
|
for d in shift_text:
|
||||||
|
if d == 'l':
|
||||||
|
if t.cells[r_idx][c_idx].spanning_h:
|
||||||
|
while not t.cells[r_idx][c_idx].left:
|
||||||
|
c_idx -= 1
|
||||||
|
if d == 'r':
|
||||||
|
if t.cells[r_idx][c_idx].spanning_h:
|
||||||
|
while not t.cells[r_idx][c_idx].right:
|
||||||
|
c_idx += 1
|
||||||
|
if d == 't':
|
||||||
|
if t.cells[r_idx][c_idx].spanning_v:
|
||||||
|
while not t.cells[r_idx][c_idx].top:
|
||||||
|
r_idx -= 1
|
||||||
|
if d == 'b':
|
||||||
|
if t.cells[r_idx][c_idx].spanning_v:
|
||||||
|
while not t.cells[r_idx][c_idx].bottom:
|
||||||
|
r_idx += 1
|
||||||
|
indices.append((r_idx, c_idx, text))
|
||||||
|
return indices
|
||||||
|
|
||||||
|
|
||||||
|
def _fill_spanning(t, fill=None):
|
||||||
|
"""Fills spanning cells.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
t : object
|
||||||
|
camelot.table.Table
|
||||||
|
|
||||||
|
fill : list
|
||||||
|
{'h', 'v'}
|
||||||
|
Specify to fill spanning cells in horizontal or vertical
|
||||||
|
direction.
|
||||||
|
(optional, default: None)
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
t : object
|
||||||
|
camelot.table.Table
|
||||||
|
"""
|
||||||
|
for f in fill:
|
||||||
|
if f == "h":
|
||||||
|
for i in range(len(t.cells)):
|
||||||
|
for j in range(len(t.cells[i])):
|
||||||
|
if t.cells[i][j].get_text().strip() == '':
|
||||||
|
if t.cells[i][j].spanning_h and not t.cells[i][j].left:
|
||||||
|
t.cells[i][j].add_text(t.cells[i][j - 1].get_text())
|
||||||
|
elif f == "v":
|
||||||
|
for i in range(len(t.cells)):
|
||||||
|
for j in range(len(t.cells[i])):
|
||||||
|
if t.cells[i][j].get_text().strip() == '':
|
||||||
|
if t.cells[i][j].spanning_v and not t.cells[i][j].top:
|
||||||
|
t.cells[i][j].add_text(t.cells[i - 1][j].get_text())
|
||||||
|
return t
|
||||||
|
|
||||||
|
def extract_tables(self, pdfname):
|
||||||
"""Expects a single page pdf as input with rotation corrected.
|
"""Expects a single page pdf as input with rotation corrected.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
|
|
@ -696,15 +698,16 @@ class Lattice:
|
||||||
else:
|
else:
|
||||||
jtolerance = copy.deepcopy(self.jtol)
|
jtolerance = copy.deepcopy(self.jtol)
|
||||||
|
|
||||||
|
g = Geometry()
|
||||||
if self.debug:
|
if self.debug:
|
||||||
self.debug_images = (img, table_bbox)
|
g.images = [(img, table_bbox)]
|
||||||
|
|
||||||
table_bbox, v_segments, h_segments = scale_to_pdf(table_bbox, v_segments,
|
table_bbox, v_segments, h_segments = scale_to_pdf(table_bbox, v_segments,
|
||||||
h_segments, factors_pdf)
|
h_segments, factors_pdf)
|
||||||
|
|
||||||
if self.debug:
|
if self.debug:
|
||||||
self.debug_segments = (v_segments, h_segments)
|
g.segments = [(v_segments, h_segments)]
|
||||||
self.debug_tables = []
|
_tables = []
|
||||||
|
|
||||||
page = {}
|
page = {}
|
||||||
tables = {}
|
tables = {}
|
||||||
|
|
@ -737,15 +740,13 @@ class Lattice:
|
||||||
table = Table(cols, rows)
|
table = Table(cols, rows)
|
||||||
# set table edges to True using ver+hor lines
|
# set table edges to True using ver+hor lines
|
||||||
table = table.set_edges(v_s, h_s, jtol=jtolerance[table_no])
|
table = table.set_edges(v_s, h_s, jtol=jtolerance[table_no])
|
||||||
nouse = table.nocont_ / (len(v_s) + len(h_s))
|
|
||||||
table_data['line_p'] = 100 * (1 - nouse)
|
|
||||||
# set spanning cells to True
|
# set spanning cells to True
|
||||||
table = table.set_spanning()
|
table = table.set_spanning()
|
||||||
# set table border edges to True
|
# set table border edges to True
|
||||||
table = table.set_border_edges()
|
table = table.set_border_edges()
|
||||||
|
|
||||||
if self.debug:
|
if self.debug:
|
||||||
self.debug_tables.append(table)
|
_tables.append(table)
|
||||||
|
|
||||||
assignment_errors = []
|
assignment_errors = []
|
||||||
table_data['split_text'] = []
|
table_data['split_text'] = []
|
||||||
|
|
@ -757,7 +758,7 @@ class Lattice:
|
||||||
flag_size=self.flag_size)
|
flag_size=self.flag_size)
|
||||||
if indices[:2] != (-1, -1):
|
if indices[:2] != (-1, -1):
|
||||||
assignment_errors.append(error)
|
assignment_errors.append(error)
|
||||||
indices = _reduce_index(table, indices, shift_text=self.shift_text)
|
indices = self._reduce_index(table, indices, shift_text=self.shift_text)
|
||||||
if len(indices) > 1:
|
if len(indices) > 1:
|
||||||
table_data['split_text'].append(indices)
|
table_data['split_text'].append(indices)
|
||||||
for r_idx, c_idx, text in indices:
|
for r_idx, c_idx, text in indices:
|
||||||
|
|
@ -768,7 +769,7 @@ class Lattice:
|
||||||
table_data['score'] = score
|
table_data['score'] = score
|
||||||
|
|
||||||
if self.fill is not None:
|
if self.fill is not None:
|
||||||
table = _fill_spanning(table, fill=self.fill)
|
table = self._fill_spanning(table, fill=self.fill)
|
||||||
ar = table.get_list()
|
ar = table.get_list()
|
||||||
ar = encode_list(ar)
|
ar = encode_list(ar)
|
||||||
table_data['data'] = ar
|
table_data['data'] = ar
|
||||||
|
|
@ -782,6 +783,7 @@ class Lattice:
|
||||||
page[os.path.basename(bname)] = tables
|
page[os.path.basename(bname)] = tables
|
||||||
|
|
||||||
if self.debug:
|
if self.debug:
|
||||||
return None
|
g.tables = _tables
|
||||||
|
return [None], [g]
|
||||||
|
|
||||||
return page
|
return page
|
||||||
|
|
@ -0,0 +1,98 @@
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
import matplotlib.patches as patches
|
||||||
|
|
||||||
|
from .handlers import PDFHandler
|
||||||
|
|
||||||
|
|
||||||
|
def plot_geometry(filepath, pages='1', mesh=False, geometry_type='text', **kwargs):
|
||||||
|
# explicit type conversion
|
||||||
|
p = PDFHandler(filepath, pages)
|
||||||
|
kwargs.update({'debug': geometry_type})
|
||||||
|
__, geometry = p.parse(mesh=mesh, **kwargs)
|
||||||
|
|
||||||
|
if geometry_type == 'text':
|
||||||
|
for text in geometry.text:
|
||||||
|
fig = plt.figure()
|
||||||
|
ax = fig.add_subplot(111, aspect='equal')
|
||||||
|
xs, ys = [], []
|
||||||
|
for t in text:
|
||||||
|
xs.extend([t[0], t[1]])
|
||||||
|
ys.extend([t[2], t[3]])
|
||||||
|
ax.add_patch(
|
||||||
|
patches.Rectangle(
|
||||||
|
(t[0], t[1]),
|
||||||
|
t[2] - t[0],
|
||||||
|
t[3] - t[1]
|
||||||
|
)
|
||||||
|
)
|
||||||
|
ax.set_xlim(min(xs) - 10, max(xs) + 10)
|
||||||
|
ax.set_ylim(min(ys) - 10, max(ys) + 10)
|
||||||
|
plt.show()
|
||||||
|
elif geometry_type == 'contour':
|
||||||
|
try:
|
||||||
|
for img, table_bbox in geometry.images:
|
||||||
|
for t in table_bbox.keys():
|
||||||
|
cv2.rectangle(img, (t[0], t[1]),
|
||||||
|
(t[2], t[3]), (255, 0, 0), 3)
|
||||||
|
plt.imshow(img)
|
||||||
|
plt.show()
|
||||||
|
except AttributeError:
|
||||||
|
raise ValueError("This option can only be used with Lattice.")
|
||||||
|
elif geometry_type == 'joint':
|
||||||
|
try:
|
||||||
|
for img, table_bbox in geometry.images:
|
||||||
|
x_coord = []
|
||||||
|
y_coord = []
|
||||||
|
for k in table_bbox.keys():
|
||||||
|
for coord in table_bbox[k]:
|
||||||
|
x_coord.append(coord[0])
|
||||||
|
y_coord.append(coord[1])
|
||||||
|
max_x, max_y = max(x_coord), max(y_coord)
|
||||||
|
plt.plot(x_coord, y_coord, 'ro')
|
||||||
|
plt.axis([0, max_x + 100, max_y + 100, 0])
|
||||||
|
plt.imshow(img)
|
||||||
|
plt.show()
|
||||||
|
except AttributeError:
|
||||||
|
raise ValueError("This option can only be used with Lattice.")
|
||||||
|
elif geometry_type == 'line':
|
||||||
|
try:
|
||||||
|
for v_s, h_s in geometry.segments:
|
||||||
|
for v in v_s:
|
||||||
|
plt.plot([v[0], v[2]], [v[1], v[3]])
|
||||||
|
for h in h_s:
|
||||||
|
plt.plot([h[0], h[2]], [h[1], h[3]])
|
||||||
|
plt.show()
|
||||||
|
except AttributeError:
|
||||||
|
raise ValueError("This option can only be used with Lattice.")
|
||||||
|
elif geometry_type == 'table':
|
||||||
|
try:
|
||||||
|
for tables in geometry.tables:
|
||||||
|
for table in tables:
|
||||||
|
for r in range(len(table.rows)):
|
||||||
|
for c in range(len(table.cols)):
|
||||||
|
if table.cells[r][c].left:
|
||||||
|
plt.plot([table.cells[r][c].lb[0],
|
||||||
|
table.cells[r][c].lt[0]],
|
||||||
|
[table.cells[r][c].lb[1],
|
||||||
|
table.cells[r][c].lt[1]])
|
||||||
|
if table.cells[r][c].right:
|
||||||
|
plt.plot([table.cells[r][c].rb[0],
|
||||||
|
table.cells[r][c].rt[0]],
|
||||||
|
[table.cells[r][c].rb[1],
|
||||||
|
table.cells[r][c].rt[1]])
|
||||||
|
if table.cells[r][c].top:
|
||||||
|
plt.plot([table.cells[r][c].lt[0],
|
||||||
|
table.cells[r][c].rt[0]],
|
||||||
|
[table.cells[r][c].lt[1],
|
||||||
|
table.cells[r][c].rt[1]])
|
||||||
|
if table.cells[r][c].bottom:
|
||||||
|
plt.plot([table.cells[r][c].lb[0],
|
||||||
|
table.cells[r][c].rb[0]],
|
||||||
|
[table.cells[r][c].lb[1],
|
||||||
|
table.cells[r][c].rb[1]])
|
||||||
|
plt.show()
|
||||||
|
except AttributeError:
|
||||||
|
raise ValueError("This option can only be used with Lattice.")
|
||||||
|
else:
|
||||||
|
raise UserWarning("This method can only be called after"
|
||||||
|
" debug has been specified.")
|
||||||
Loading…
Reference in New Issue