Replace chars with textlines
* Add split function * Add split_text and shift_text params * Change get_rotation * Move get_column_index to utils * Add split_text and shift_text * Fix split_text
pull/2/head
parent
02ef332bd6
commit
a43d5ca2c7
|
|
@ -8,10 +8,10 @@ import subprocess
|
||||||
from .imgproc import (adaptive_threshold, find_lines, find_table_contours,
|
from .imgproc import (adaptive_threshold, find_lines, find_table_contours,
|
||||||
find_table_joints)
|
find_table_joints)
|
||||||
from .table import Table
|
from .table import Table
|
||||||
from .utils import (scale_to_pdf, scale_to_image, get_rotation, segments_bbox,
|
from .utils import (scale_to_pdf, scale_to_image, get_rotation, rotate_segments,
|
||||||
text_bbox, merge_close_values, get_row_index,
|
rotate_textlines, rotate_table, segments_bbox, text_in_bbox,
|
||||||
get_column_index, get_score, count_empty, encode_list,
|
merge_close_values, get_table_index, get_score, count_empty,
|
||||||
get_text_objects, get_page_layout)
|
encode_list, get_text_objects, get_page_layout)
|
||||||
|
|
||||||
|
|
||||||
__all__ = ['Lattice']
|
__all__ = ['Lattice']
|
||||||
|
|
@ -25,6 +25,52 @@ def _reduce_method(m):
|
||||||
copy_reg.pickle(types.MethodType, _reduce_method)
|
copy_reg.pickle(types.MethodType, _reduce_method)
|
||||||
|
|
||||||
|
|
||||||
|
def _reduce_index(t, idx, shift_text):
|
||||||
|
"""Reduces index of a text object if it lies within a spanning
|
||||||
|
cell, moving it in the direction(s) given by shift_text.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
t : object
|
||||||
|
camelot.table.Table
|
||||||
|
|
||||||
|
idx : list
|
||||||
|
List of tuples of the form (r_idx, c_idx, text).
|
||||||
|
|
||||||
|
shift_text : list
|
||||||
|
{'l', 'r', 't', 'b'}
|
||||||
|
Select one or more from above and pass them as a list to
|
||||||
|
specify where the text in a spanning cell should flow.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
indices : list
|
||||||
|
List of tuples of the form (r_idx, c_idx, text) where r_idx and c_idx
|
||||||
|
are the reduced row/column indices and text is an lttextline substring.
|
||||||
|
"""
|
||||||
|
indices = []
|
||||||
|
for r_idx, c_idx, text in idx:
|
||||||
|
for d in shift_text:
|
||||||
|
if d == 'l':
|
||||||
|
if t.cells[r_idx][c_idx].spanning_h:
|
||||||
|
while not t.cells[r_idx][c_idx].left:
|
||||||
|
c_idx -= 1
|
||||||
|
if d == 'r':
|
||||||
|
if t.cells[r_idx][c_idx].spanning_h:
|
||||||
|
while not t.cells[r_idx][c_idx].right:
|
||||||
|
c_idx += 1
|
||||||
|
if d == 't':
|
||||||
|
if t.cells[r_idx][c_idx].spanning_v:
|
||||||
|
while not t.cells[r_idx][c_idx].top:
|
||||||
|
r_idx -= 1
|
||||||
|
if d == 'b':
|
||||||
|
if t.cells[r_idx][c_idx].spanning_v:
|
||||||
|
while not t.cells[r_idx][c_idx].bottom:
|
||||||
|
r_idx += 1
|
||||||
|
indices.append((r_idx, c_idx, text))
|
||||||
|
return indices
|
||||||
|
|
||||||
|
|
||||||
def _fill_spanning(t, fill=None):
|
def _fill_spanning(t, fill=None):
|
||||||
"""Fills spanning cells.
|
"""Fills spanning cells.
|
||||||
|
|
||||||
|
|
@ -67,78 +113,6 @@ def _fill_spanning(t, fill=None):
|
||||||
return t
|
return t
|
||||||
|
|
||||||
|
|
||||||
def _outline(t):
|
|
||||||
"""Sets table border edges to True.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
t : object
|
|
||||||
camelot.table.Table
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
t : object
|
|
||||||
camelot.table.Table
|
|
||||||
"""
|
|
||||||
for i in range(len(t.cells)):
|
|
||||||
t.cells[i][0].left = True
|
|
||||||
t.cells[i][len(t.cells[i]) - 1].right = True
|
|
||||||
for i in range(len(t.cells[0])):
|
|
||||||
t.cells[0][i].top = True
|
|
||||||
t.cells[len(t.cells) - 1][i].bottom = True
|
|
||||||
return t
|
|
||||||
|
|
||||||
|
|
||||||
def _reduce_index(t, rotation, r_idx, c_idx):
|
|
||||||
"""Reduces index of a text object if it lies within a spanning
|
|
||||||
cell taking in account table rotation.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
t : object
|
|
||||||
camelot.table.Table
|
|
||||||
|
|
||||||
rotation : string
|
|
||||||
{'', 'left', 'right'}
|
|
||||||
|
|
||||||
r_idx : int
|
|
||||||
Current row index.
|
|
||||||
|
|
||||||
c_idx : int
|
|
||||||
Current column index.
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
r_idx : int
|
|
||||||
Reduced row index.
|
|
||||||
|
|
||||||
c_idx : int
|
|
||||||
Reduced column index.
|
|
||||||
"""
|
|
||||||
if not rotation:
|
|
||||||
if t.cells[r_idx][c_idx].spanning_h:
|
|
||||||
while not t.cells[r_idx][c_idx].left:
|
|
||||||
c_idx -= 1
|
|
||||||
if t.cells[r_idx][c_idx].spanning_v:
|
|
||||||
while not t.cells[r_idx][c_idx].top:
|
|
||||||
r_idx -= 1
|
|
||||||
elif rotation == 'left':
|
|
||||||
if t.cells[r_idx][c_idx].spanning_h:
|
|
||||||
while not t.cells[r_idx][c_idx].left:
|
|
||||||
c_idx -= 1
|
|
||||||
if t.cells[r_idx][c_idx].spanning_v:
|
|
||||||
while not t.cells[r_idx][c_idx].bottom:
|
|
||||||
r_idx += 1
|
|
||||||
elif rotation == 'right':
|
|
||||||
if t.cells[r_idx][c_idx].spanning_h:
|
|
||||||
while not t.cells[r_idx][c_idx].right:
|
|
||||||
c_idx += 1
|
|
||||||
if t.cells[r_idx][c_idx].spanning_v:
|
|
||||||
while not t.cells[r_idx][c_idx].top:
|
|
||||||
r_idx -= 1
|
|
||||||
return r_idx, c_idx
|
|
||||||
|
|
||||||
|
|
||||||
class Lattice:
|
class Lattice:
|
||||||
"""Lattice looks for lines in the pdf to form a table.
|
"""Lattice looks for lines in the pdf to form a table.
|
||||||
|
|
||||||
|
|
@ -179,6 +153,17 @@ class Lattice:
|
||||||
PDFMiner margins. (char_margin, line_margin, word_margin)
|
PDFMiner margins. (char_margin, line_margin, word_margin)
|
||||||
(optional, default: (1.0, 0.5, 0.1))
|
(optional, default: (1.0, 0.5, 0.1))
|
||||||
|
|
||||||
|
split_text : bool
|
||||||
|
Whether or not to split a text line if it spans across
|
||||||
|
different cells.
|
||||||
|
(optional, default: False)
|
||||||
|
|
||||||
|
shift_text : list
|
||||||
|
{'l', 'r', 't', 'b'}
|
||||||
|
Select one or more from above and pass them as a list to
|
||||||
|
specify where the text in a spanning cell should flow.
|
||||||
|
(optional, default: ['l', 't'])
|
||||||
|
|
||||||
debug : string
|
debug : string
|
||||||
{'contour', 'line', 'joint', 'table'}
|
{'contour', 'line', 'joint', 'table'}
|
||||||
Set to one of the above values to generate a matplotlib plot
|
Set to one of the above values to generate a matplotlib plot
|
||||||
|
|
@ -186,7 +171,8 @@ class Lattice:
|
||||||
(optional, default: None)
|
(optional, default: None)
|
||||||
"""
|
"""
|
||||||
def __init__(self, table_area=None, fill=None, mtol=[2], scale=15,
|
def __init__(self, table_area=None, fill=None, mtol=[2], scale=15,
|
||||||
invert=False, margins=(1.0, 0.5, 0.1), debug=None):
|
invert=False, margins=(1.0, 0.5, 0.1), split_text=False,
|
||||||
|
shift_text=['l', 't'], debug=None):
|
||||||
|
|
||||||
self.method = 'lattice'
|
self.method = 'lattice'
|
||||||
self.table_area = table_area
|
self.table_area = table_area
|
||||||
|
|
@ -195,6 +181,8 @@ class Lattice:
|
||||||
self.scale = scale
|
self.scale = scale
|
||||||
self.invert = invert
|
self.invert = invert
|
||||||
self.char_margin, self.line_margin, self.word_margin = margins
|
self.char_margin, self.line_margin, self.word_margin = margins
|
||||||
|
self.split_text = split_text
|
||||||
|
self.shift_text = shift_text
|
||||||
self.debug = debug
|
self.debug = debug
|
||||||
|
|
||||||
def get_tables(self, pdfname):
|
def get_tables(self, pdfname):
|
||||||
|
|
@ -211,9 +199,9 @@ class Lattice:
|
||||||
"""
|
"""
|
||||||
layout, dim = get_page_layout(pdfname, char_margin=self.char_margin,
|
layout, dim = get_page_layout(pdfname, char_margin=self.char_margin,
|
||||||
line_margin=self.line_margin, word_margin=self.word_margin)
|
line_margin=self.line_margin, word_margin=self.word_margin)
|
||||||
ltchar = get_text_objects(layout, LTType="char")
|
lttextlh = get_text_objects(layout, ltype="lh")
|
||||||
lttextlh = get_text_objects(layout, LTType="lh")
|
lttextlv = get_text_objects(layout, ltype="lv")
|
||||||
lttextlv = get_text_objects(layout, LTType="lv")
|
ltchar = get_text_objects(layout, ltype="char")
|
||||||
width, height = dim
|
width, height = dim
|
||||||
bname, __ = os.path.splitext(pdfname)
|
bname, __ = os.path.splitext(pdfname)
|
||||||
if not ltchar:
|
if not ltchar:
|
||||||
|
|
@ -287,11 +275,15 @@ class Lattice:
|
||||||
# select elements which lie within table_bbox
|
# select elements which lie within table_bbox
|
||||||
table_data = {}
|
table_data = {}
|
||||||
v_s, h_s = segments_bbox(k, v_segments, h_segments)
|
v_s, h_s = segments_bbox(k, v_segments, h_segments)
|
||||||
char_bbox = text_bbox(k, ltchar)
|
lh_bbox = text_in_bbox(k, lttextlh)
|
||||||
lh_bbox = text_bbox(k, lttextlh)
|
lv_bbox = text_in_bbox(k, lttextlv)
|
||||||
lv_bbox = text_bbox(k, lttextlv)
|
char_bbox = text_in_bbox(k, ltchar)
|
||||||
table_data['text_p'] = 100 * (1 - (len(char_bbox) / len(ltchar)))
|
table_data['text_p'] = 100 * (1 - (len(char_bbox) / len(ltchar)))
|
||||||
table_rotation = get_rotation(char_bbox, lh_bbox, lv_bbox)
|
table_rotation = get_rotation(lh_bbox, lv_bbox, char_bbox)
|
||||||
|
v_s, h_s = rotate_segments(v_s, h_s, table_rotation)
|
||||||
|
t_bbox = rotate_textlines(lh_bbox, lv_bbox, table_rotation)
|
||||||
|
for direction in t_bbox:
|
||||||
|
t_bbox[direction].sort(key=lambda x: (-x.y0, x.x0))
|
||||||
cols, rows = zip(*table_bbox[k])
|
cols, rows = zip(*table_bbox[k])
|
||||||
cols, rows = list(cols), list(rows)
|
cols, rows = list(cols), list(rows)
|
||||||
cols.extend([k[0], k[2]])
|
cols.extend([k[0], k[2]])
|
||||||
|
|
@ -305,6 +297,7 @@ class Lattice:
|
||||||
for i in range(0, len(cols) - 1)]
|
for i in range(0, len(cols) - 1)]
|
||||||
rows = [(rows[i], rows[i + 1])
|
rows = [(rows[i], rows[i + 1])
|
||||||
for i in range(0, len(rows) - 1)]
|
for i in range(0, len(rows) - 1)]
|
||||||
|
rows, cols = rotate_table(rows, cols, table_rotation)
|
||||||
table = Table(cols, rows)
|
table = Table(cols, rows)
|
||||||
# set table edges to True using ver+hor lines
|
# set table edges to True using ver+hor lines
|
||||||
table = table.set_edges(v_s, h_s)
|
table = table.set_edges(v_s, h_s)
|
||||||
|
|
@ -313,58 +306,26 @@ class Lattice:
|
||||||
# set spanning cells to True
|
# set spanning cells to True
|
||||||
table = table.set_spanning()
|
table = table.set_spanning()
|
||||||
# set table border edges to True
|
# set table border edges to True
|
||||||
table = _outline(table)
|
table = table.set_border_edges()
|
||||||
|
|
||||||
if self.debug:
|
if self.debug:
|
||||||
self.debug_tables.append(table)
|
self.debug_tables.append(table)
|
||||||
|
|
||||||
rerror = []
|
assignment_errors = []
|
||||||
cerror = []
|
for direction in t_bbox:
|
||||||
for t in char_bbox:
|
for t in t_bbox[direction]:
|
||||||
try:
|
indices, error = get_table_index(
|
||||||
r_idx, rass_error = get_row_index(t, rows)
|
table, t, direction, split_text=self.split_text)
|
||||||
except TypeError:
|
assignment_errors.append(error)
|
||||||
# couldn't assign LTChar to any cell
|
indices = _reduce_index(table, indices, shift_text=self.shift_text)
|
||||||
continue
|
for r_idx, c_idx, text in indices:
|
||||||
try:
|
table.cells[r_idx][c_idx].add_text(text)
|
||||||
c_idx, cass_error = get_column_index(t, cols)
|
score = get_score([[100, assignment_errors]])
|
||||||
except TypeError:
|
|
||||||
# couldn't assign LTChar to any cell
|
|
||||||
continue
|
|
||||||
rerror.append(rass_error)
|
|
||||||
cerror.append(cass_error)
|
|
||||||
r_idx, c_idx = _reduce_index(table, table_rotation, r_idx, c_idx)
|
|
||||||
table.cells[r_idx][c_idx].add_object(t)
|
|
||||||
|
|
||||||
for i in range(len(table.cells)):
|
|
||||||
for j in range(len(table.cells[i])):
|
|
||||||
t_bbox = table.cells[i][j].get_objects()
|
|
||||||
try:
|
|
||||||
cell_rotation = get_rotation(t_bbox)
|
|
||||||
except ZeroDivisionError:
|
|
||||||
cell_rotation = ''
|
|
||||||
pass
|
|
||||||
# fill text after sorting it
|
|
||||||
if cell_rotation == '':
|
|
||||||
t_bbox.sort(key=lambda x: (-x.y0, x.x0))
|
|
||||||
elif cell_rotation == 'left':
|
|
||||||
t_bbox.sort(key=lambda x: (x.x0, x.y0))
|
|
||||||
elif cell_rotation == 'right':
|
|
||||||
t_bbox.sort(key=lambda x: (-x.x0, -x.y0))
|
|
||||||
table.cells[i][j].add_text(''.join([t.get_text()
|
|
||||||
for t in t_bbox]))
|
|
||||||
|
|
||||||
score = get_score([[50, rerror], [50, cerror]])
|
|
||||||
table_data['score'] = score
|
table_data['score'] = score
|
||||||
|
|
||||||
if self.fill is not None:
|
if self.fill is not None:
|
||||||
table = _fill_spanning(table, fill=self.fill[table_no])
|
table = _fill_spanning(table, fill=self.fill[table_no])
|
||||||
ar = table.get_list()
|
ar = table.get_list()
|
||||||
if table_rotation == 'left':
|
|
||||||
ar = zip(*ar[::-1])
|
|
||||||
elif table_rotation == 'right':
|
|
||||||
ar = zip(*ar[::1])
|
|
||||||
ar.reverse()
|
|
||||||
ar = encode_list(ar)
|
ar = encode_list(ar)
|
||||||
table_data['data'] = ar
|
table_data['data'] = ar
|
||||||
empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar)
|
empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar)
|
||||||
|
|
|
||||||
|
|
@ -196,28 +196,28 @@ class Pdf:
|
||||||
try:
|
try:
|
||||||
for tables in self.debug_tables:
|
for tables in self.debug_tables:
|
||||||
for table in tables:
|
for table in tables:
|
||||||
for i in range(len(table.cells)):
|
for r in range(len(table.rows)):
|
||||||
for j in range(len(table.cells[i])):
|
for c in range(len(table.cols)):
|
||||||
if table.cells[i][j].left:
|
if table.cells[r][c].left:
|
||||||
plt.plot([table.cells[i][j].lb[0],
|
plt.plot([table.cells[r][c].lb[0],
|
||||||
table.cells[i][j].lt[0]],
|
table.cells[r][c].lt[0]],
|
||||||
[table.cells[i][j].lb[1],
|
[table.cells[r][c].lb[1],
|
||||||
table.cells[i][j].lt[1]])
|
table.cells[r][c].lt[1]])
|
||||||
if table.cells[i][j].right:
|
if table.cells[r][c].right:
|
||||||
plt.plot([table.cells[i][j].rb[0],
|
plt.plot([table.cells[r][c].rb[0],
|
||||||
table.cells[i][j].rt[0]],
|
table.cells[r][c].rt[0]],
|
||||||
[table.cells[i][j].rb[1],
|
[table.cells[r][c].rb[1],
|
||||||
table.cells[i][j].rt[1]])
|
table.cells[r][c].rt[1]])
|
||||||
if table.cells[i][j].top:
|
if table.cells[r][c].top:
|
||||||
plt.plot([table.cells[i][j].lt[0],
|
plt.plot([table.cells[r][c].lt[0],
|
||||||
table.cells[i][j].rt[0]],
|
table.cells[r][c].rt[0]],
|
||||||
[table.cells[i][j].lt[1],
|
[table.cells[r][c].lt[1],
|
||||||
table.cells[i][j].rt[1]])
|
table.cells[r][c].rt[1]])
|
||||||
if table.cells[i][j].bottom:
|
if table.cells[r][c].bottom:
|
||||||
plt.plot([table.cells[i][j].lb[0],
|
plt.plot([table.cells[r][c].lb[0],
|
||||||
table.cells[i][j].rb[0]],
|
table.cells[r][c].rb[0]],
|
||||||
[table.cells[i][j].lb[1],
|
[table.cells[r][c].lb[1],
|
||||||
table.cells[i][j].rb[1]])
|
table.cells[r][c].rb[1]])
|
||||||
plt.show()
|
plt.show()
|
||||||
except AttributeError:
|
except AttributeError:
|
||||||
raise ValueError("This option only be used with Lattice.")
|
raise ValueError("This option only be used with Lattice.")
|
||||||
|
|
|
||||||
|
|
@ -7,8 +7,9 @@ import copy_reg
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
from .table import Table
|
from .table import Table
|
||||||
from .utils import (rotate, get_rotation, text_bbox, get_row_index, get_score,
|
from .utils import (rotate, get_rotation, rotate_textlines, text_in_bbox,
|
||||||
count_empty, encode_list, get_text_objects, get_page_layout)
|
get_table_index, get_score, count_empty, encode_list,
|
||||||
|
get_text_objects, get_page_layout)
|
||||||
|
|
||||||
|
|
||||||
__all__ = ['Stream']
|
__all__ = ['Stream']
|
||||||
|
|
@ -22,6 +23,29 @@ def _reduce_method(m):
|
||||||
copy_reg.pickle(types.MethodType, _reduce_method)
|
copy_reg.pickle(types.MethodType, _reduce_method)
|
||||||
|
|
||||||
|
|
||||||
|
def _text_bbox(t_bbox):
|
||||||
|
"""Returns bounding box for the text present on a page.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
t_bbox : dict
|
||||||
|
Dict with two keys 'horizontal' and 'vertical' with lists of
|
||||||
|
LTTextLineHorizontals and LTTextLineVerticals respectively.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
text_bbox : tuple
|
||||||
|
Tuple of the form (x0, y0, x1, y1) in PDFMiner's coordinate
|
||||||
|
space.
|
||||||
|
"""
|
||||||
|
xmin = min([t.x0 for direction in t_bbox for t in t_bbox[direction]])
|
||||||
|
ymin = min([t.y0 for direction in t_bbox for t in t_bbox[direction]])
|
||||||
|
xmax = max([t.x1 for direction in t_bbox for t in t_bbox[direction]])
|
||||||
|
ymax = max([t.y1 for direction in t_bbox for t in t_bbox[direction]])
|
||||||
|
text_bbox = (xmin, ymin, xmax, ymax)
|
||||||
|
return text_bbox
|
||||||
|
|
||||||
|
|
||||||
def _group_rows(text, ytol=2):
|
def _group_rows(text, ytol=2):
|
||||||
"""Groups PDFMiner text objects into rows using their
|
"""Groups PDFMiner text objects into rows using their
|
||||||
y-coordinates taking into account some tolerance ytol.
|
y-coordinates taking into account some tolerance ytol.
|
||||||
|
|
@ -185,45 +209,6 @@ def _add_columns(cols, text, ytol):
|
||||||
return cols
|
return cols
|
||||||
|
|
||||||
|
|
||||||
def _get_column_index(t, columns):
|
|
||||||
"""Gets index of the column in which the given text object lies by
|
|
||||||
comparing their x-coordinates.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
t : object
|
|
||||||
|
|
||||||
columns : list
|
|
||||||
List of column coordinate tuples.
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
c_idx : int
|
|
||||||
|
|
||||||
error : float
|
|
||||||
"""
|
|
||||||
offset1, offset2 = 0, 0
|
|
||||||
lt_col_overlap = []
|
|
||||||
for c in columns:
|
|
||||||
if c[0] <= t.x1 and c[1] >= t.x0:
|
|
||||||
left = t.x0 if c[0] <= t.x0 else c[0]
|
|
||||||
right = t.x1 if c[1] >= t.x1 else c[1]
|
|
||||||
lt_col_overlap.append(abs(left - right) / abs(c[0] - c[1]))
|
|
||||||
else:
|
|
||||||
lt_col_overlap.append(-1)
|
|
||||||
if len(filter(lambda x: x != -1, lt_col_overlap)) == 0:
|
|
||||||
logging.warning("Text doesn't fit any column.")
|
|
||||||
c_idx = lt_col_overlap.index(max(lt_col_overlap))
|
|
||||||
if t.x0 < columns[c_idx][0]:
|
|
||||||
offset1 = abs(t.x0 - columns[c_idx][0])
|
|
||||||
if t.x1 > columns[c_idx][1]:
|
|
||||||
offset2 = abs(t.x1 - columns[c_idx][1])
|
|
||||||
Y = abs(t.y0 - t.y1)
|
|
||||||
charea = abs(t.x0 - t.x1) * abs(t.y0 - t.y1)
|
|
||||||
error = (Y * (offset1 + offset2)) / charea
|
|
||||||
return c_idx, error
|
|
||||||
|
|
||||||
|
|
||||||
class Stream:
|
class Stream:
|
||||||
"""Stream looks for spaces between text elements to form a table.
|
"""Stream looks for spaces between text elements to form a table.
|
||||||
|
|
||||||
|
|
@ -265,13 +250,19 @@ class Stream:
|
||||||
PDFMiner margins. (char_margin, line_margin, word_margin)
|
PDFMiner margins. (char_margin, line_margin, word_margin)
|
||||||
(optional, default: (1.0, 0.5, 0.1))
|
(optional, default: (1.0, 0.5, 0.1))
|
||||||
|
|
||||||
|
split_text : bool
|
||||||
|
Whether or not to split a text line if it spans across
|
||||||
|
different cells.
|
||||||
|
(optional, default: False)
|
||||||
|
|
||||||
debug : bool
|
debug : bool
|
||||||
Set to True to generate a matplotlib plot of
|
Set to True to generate a matplotlib plot of
|
||||||
LTTextLineHorizontals in order to select table_area, columns.
|
LTTextLineHorizontals in order to select table_area, columns.
|
||||||
(optional, default: False)
|
(optional, default: False)
|
||||||
"""
|
"""
|
||||||
def __init__(self, table_area=None, columns=None, ncolumns=None, ytol=[2],
|
def __init__(self, table_area=None, columns=None, ncolumns=None, ytol=[2],
|
||||||
mtol=[0], margins=(1.0, 0.5, 0.1), debug=False):
|
mtol=[0], margins=(1.0, 0.5, 0.1), split_text=False,
|
||||||
|
debug=False):
|
||||||
|
|
||||||
self.method = 'stream'
|
self.method = 'stream'
|
||||||
self.table_area = table_area
|
self.table_area = table_area
|
||||||
|
|
@ -280,6 +271,7 @@ class Stream:
|
||||||
self.ytol = ytol
|
self.ytol = ytol
|
||||||
self.mtol = mtol
|
self.mtol = mtol
|
||||||
self.char_margin, self.line_margin, self.word_margin = margins
|
self.char_margin, self.line_margin, self.word_margin = margins
|
||||||
|
self.split_text = split_text
|
||||||
self.debug = debug
|
self.debug = debug
|
||||||
|
|
||||||
def get_tables(self, pdfname):
|
def get_tables(self, pdfname):
|
||||||
|
|
@ -296,9 +288,9 @@ class Stream:
|
||||||
"""
|
"""
|
||||||
layout, dim = get_page_layout(pdfname, char_margin=self.char_margin,
|
layout, dim = get_page_layout(pdfname, char_margin=self.char_margin,
|
||||||
line_margin=self.line_margin, word_margin=self.word_margin)
|
line_margin=self.line_margin, word_margin=self.word_margin)
|
||||||
ltchar = get_text_objects(layout, LTType="char")
|
lttextlh = get_text_objects(layout, ltype="lh")
|
||||||
lttextlh = get_text_objects(layout, LTType="lh")
|
lttextlv = get_text_objects(layout, ltype="lv")
|
||||||
lttextlv = get_text_objects(layout, LTType="lv")
|
ltchar = get_text_objects(layout, ltype="char")
|
||||||
width, height = dim
|
width, height = dim
|
||||||
bname, __ = os.path.splitext(pdfname)
|
bname, __ = os.path.splitext(pdfname)
|
||||||
if not lttextlh:
|
if not lttextlh:
|
||||||
|
|
@ -308,6 +300,8 @@ class Stream:
|
||||||
|
|
||||||
if self.debug:
|
if self.debug:
|
||||||
self.debug_text = []
|
self.debug_text = []
|
||||||
|
self.debug_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in lttextlh])
|
||||||
|
self.debug_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in lttextlv])
|
||||||
|
|
||||||
if self.table_area is not None:
|
if self.table_area is not None:
|
||||||
if self.columns is not None:
|
if self.columns is not None:
|
||||||
|
|
@ -339,34 +333,16 @@ class Stream:
|
||||||
for k in sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True):
|
for k in sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True):
|
||||||
# select elements which lie within table_bbox
|
# select elements which lie within table_bbox
|
||||||
table_data = {}
|
table_data = {}
|
||||||
table_rotation = get_rotation(ltchar, lttextlh, lttextlv)
|
lh_bbox = text_in_bbox(k, lttextlh)
|
||||||
if table_rotation != '':
|
lv_bbox = text_in_bbox(k, lttextlv)
|
||||||
t_bbox = text_bbox(k, lttextlv)
|
char_bbox = text_in_bbox(k, ltchar)
|
||||||
if table_rotation == 'left':
|
table_data['text_p'] = 100 * (1 - (len(char_bbox) / len(ltchar)))
|
||||||
if self.debug:
|
table_rotation = get_rotation(lh_bbox, lv_bbox, char_bbox)
|
||||||
self.debug_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in lttextlv])
|
t_bbox = rotate_textlines(lh_bbox, lv_bbox, table_rotation)
|
||||||
for t in t_bbox:
|
for direction in t_bbox:
|
||||||
x0, y0, x1, y1 = t.bbox
|
t_bbox[direction].sort(key=lambda x: (-x.y0, x.x0))
|
||||||
x0, y0 = rotate(0, 0, x0, y0, -np.pi / 2)
|
text_x_min, text_y_min, text_x_max, text_y_max = _text_bbox(t_bbox)
|
||||||
x1, y1 = rotate(0, 0, x1, y1, -np.pi / 2)
|
rows_grouped = _group_rows(t_bbox['horizontal'], ytol=self.ytol[table_no])
|
||||||
t.set_bbox((x0, y1, x1, y0))
|
|
||||||
elif table_rotation == 'right':
|
|
||||||
for t in t_bbox:
|
|
||||||
x0, y0, x1, y1 = t.bbox
|
|
||||||
x0, y0 = rotate(0, 0, x0, y0, np.pi / 2)
|
|
||||||
x1, y1 = rotate(0, 0, x1, y1, np.pi / 2)
|
|
||||||
t.set_bbox((x1, y0, x0, y1))
|
|
||||||
else:
|
|
||||||
if self.debug:
|
|
||||||
self.debug_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in lttextlh])
|
|
||||||
t_bbox = text_bbox(k, lttextlh)
|
|
||||||
t_bbox.sort(key=lambda x: (-x.y0, x.x0))
|
|
||||||
|
|
||||||
text_x_min = min([t.x0 for t in t_bbox])
|
|
||||||
text_y_min = min([t.y0 for t in t_bbox])
|
|
||||||
text_x_max = max([t.x1 for t in t_bbox])
|
|
||||||
text_y_max = max([t.y1 for t in t_bbox])
|
|
||||||
rows_grouped = _group_rows(t_bbox, ytol=self.ytol[table_no])
|
|
||||||
rows = _join_rows(rows_grouped, text_y_max, text_y_min)
|
rows = _join_rows(rows_grouped, text_y_max, text_y_min)
|
||||||
elements = [len(r) for r in rows_grouped]
|
elements = [len(r) for r in rows_grouped]
|
||||||
|
|
||||||
|
|
@ -402,9 +378,9 @@ class Stream:
|
||||||
len_non_mode = len(filter(lambda x: x != ncols, elements))
|
len_non_mode = len(filter(lambda x: x != ncols, elements))
|
||||||
if ncols == 1 and not self.debug:
|
if ncols == 1 and not self.debug:
|
||||||
# no tables detected
|
# no tables detected
|
||||||
logging.warning("{}: Only one column was detected, the PDF"
|
logging.warning("{}: Only one column was detected, the pdf"
|
||||||
" may have no tables. Specify ncols if"
|
" may have no tables. Specify ncols if"
|
||||||
" the PDF has tables.".format(
|
" the pdf has tables.".format(
|
||||||
os.path.basename(bname)))
|
os.path.basename(bname)))
|
||||||
cols = [(t.x0, t.x1)
|
cols = [(t.x0, t.x1)
|
||||||
for r in rows_grouped if len(r) == ncols for t in r]
|
for r in rows_grouped if len(r) == ncols for t in r]
|
||||||
|
|
@ -413,35 +389,30 @@ class Stream:
|
||||||
for i in range(1, len(cols)):
|
for i in range(1, len(cols)):
|
||||||
left = cols[i - 1][1]
|
left = cols[i - 1][1]
|
||||||
right = cols[i][0]
|
right = cols[i][0]
|
||||||
inner_text.extend([t for t in t_bbox if t.x0 > left and t.x1 < right])
|
inner_text.extend([t for direction in t_bbox
|
||||||
outer_text = [t for t in t_bbox if t.x0 > cols[-1][1] or t.x1 < cols[0][0]]
|
for t in t_bbox[direction]
|
||||||
|
if t.x0 > left and t.x1 < right])
|
||||||
|
outer_text = [t for direction in t_bbox
|
||||||
|
for t in t_bbox[direction]
|
||||||
|
if t.x0 > cols[-1][1] or t.x1 < cols[0][0]]
|
||||||
inner_text.extend(outer_text)
|
inner_text.extend(outer_text)
|
||||||
cols = _add_columns(cols, inner_text, self.ytol[table_no])
|
cols = _add_columns(cols, inner_text, self.ytol[table_no])
|
||||||
cols = _join_columns(cols, text_x_min, text_x_max)
|
cols = _join_columns(cols, text_x_min, text_x_max)
|
||||||
|
|
||||||
table = Table(cols, rows)
|
table = Table(cols, rows)
|
||||||
rerror = []
|
table = table.set_all_edges()
|
||||||
cerror = []
|
assignment_errors = []
|
||||||
for row in rows_grouped:
|
for direction in t_bbox:
|
||||||
for t in row:
|
for t in t_bbox[direction]:
|
||||||
try:
|
indices, error = get_table_index(
|
||||||
r_idx, rass_error = get_row_index(t, rows)
|
table, t, direction, split_text=self.split_text)
|
||||||
except ValueError as e:
|
assignment_errors.append(error)
|
||||||
# couldn't assign LTTextLH to any cell
|
for r_idx, c_idx, text in indices:
|
||||||
continue
|
table.cells[r_idx][c_idx].add_text(text)
|
||||||
try:
|
|
||||||
c_idx, cass_error = _get_column_index(t, cols)
|
|
||||||
except ValueError as e:
|
|
||||||
# couldn't assign LTTextLH to any cell
|
|
||||||
continue
|
|
||||||
rerror.append(rass_error)
|
|
||||||
cerror.append(cass_error)
|
|
||||||
table.cells[r_idx][c_idx].add_text(
|
|
||||||
t.get_text().strip('\n'))
|
|
||||||
if guess:
|
if guess:
|
||||||
score = get_score([[33, rerror], [33, cerror], [34, [len_non_mode / len(elements)]]])
|
score = get_score([[66, assignment_errors], [34, [len_non_mode / len(elements)]]])
|
||||||
else:
|
else:
|
||||||
score = get_score([[50, rerror], [50, cerror]])
|
score = get_score([[100, assignment_errors]])
|
||||||
|
|
||||||
table_data['score'] = score
|
table_data['score'] = score
|
||||||
ar = table.get_list()
|
ar = table.get_list()
|
||||||
|
|
|
||||||
|
|
@ -35,6 +35,28 @@ class Table:
|
||||||
for c in cols] for r in rows]
|
for c in cols] for r in rows]
|
||||||
self.nocont_ = 0
|
self.nocont_ = 0
|
||||||
|
|
||||||
|
def set_all_edges(self):
|
||||||
|
"""Sets all table edges to True.
|
||||||
|
"""
|
||||||
|
for r in range(len(self.rows)):
|
||||||
|
for c in range(len(self.cols)):
|
||||||
|
self.cells[r][c].left = True
|
||||||
|
self.cells[r][c].right = True
|
||||||
|
self.cells[r][c].top = True
|
||||||
|
self.cells[r][c].bottom = True
|
||||||
|
return self
|
||||||
|
|
||||||
|
def set_border_edges(self):
|
||||||
|
"""Sets table border edges to True.
|
||||||
|
"""
|
||||||
|
for r in range(len(self.rows)):
|
||||||
|
self.cells[r][0].left = True
|
||||||
|
self.cells[r][len(self.cols) - 1].right = True
|
||||||
|
for c in range(len(self.cols)):
|
||||||
|
self.cells[0][c].top = True
|
||||||
|
self.cells[len(self.rows) - 1][c].bottom = True
|
||||||
|
return self
|
||||||
|
|
||||||
def set_edges(self, vertical, horizontal, jtol=2):
|
def set_edges(self, vertical, horizontal, jtol=2):
|
||||||
"""Sets a cell's edges to True depending on whether they
|
"""Sets a cell's edges to True depending on whether they
|
||||||
overlap with lines found by imgproc.
|
overlap with lines found by imgproc.
|
||||||
|
|
@ -160,47 +182,47 @@ class Table:
|
||||||
depending on whether the cell spans/extends horizontally or
|
depending on whether the cell spans/extends horizontally or
|
||||||
vertically.
|
vertically.
|
||||||
"""
|
"""
|
||||||
for i in range(len(self.cells)):
|
for r in range(len(self.rows)):
|
||||||
for j in range(len(self.cells[i])):
|
for c in range(len(self.cols)):
|
||||||
bound = self.cells[i][j].get_bounded_edges()
|
bound = self.cells[r][c].get_bounded_edges()
|
||||||
if bound == 4:
|
if bound == 4:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
elif bound == 3:
|
elif bound == 3:
|
||||||
if not self.cells[i][j].left:
|
if not self.cells[r][c].left:
|
||||||
if (self.cells[i][j].right and
|
if (self.cells[r][c].right and
|
||||||
self.cells[i][j].top and
|
self.cells[r][c].top and
|
||||||
self.cells[i][j].bottom):
|
self.cells[r][c].bottom):
|
||||||
self.cells[i][j].spanning_h = True
|
self.cells[r][c].spanning_h = True
|
||||||
|
|
||||||
elif not self.cells[i][j].right:
|
elif not self.cells[r][c].right:
|
||||||
if (self.cells[i][j].left and
|
if (self.cells[r][c].left and
|
||||||
self.cells[i][j].top and
|
self.cells[r][c].top and
|
||||||
self.cells[i][j].bottom):
|
self.cells[r][c].bottom):
|
||||||
self.cells[i][j].spanning_h = True
|
self.cells[r][c].spanning_h = True
|
||||||
|
|
||||||
elif not self.cells[i][j].top:
|
elif not self.cells[r][c].top:
|
||||||
if (self.cells[i][j].left and
|
if (self.cells[r][c].left and
|
||||||
self.cells[i][j].right and
|
self.cells[r][c].right and
|
||||||
self.cells[i][j].bottom):
|
self.cells[r][c].bottom):
|
||||||
self.cells[i][j].spanning_v = True
|
self.cells[r][c].spanning_v = True
|
||||||
|
|
||||||
elif not self.cells[i][j].bottom:
|
elif not self.cells[r][c].bottom:
|
||||||
if (self.cells[i][j].left and
|
if (self.cells[r][c].left and
|
||||||
self.cells[i][j].right and
|
self.cells[r][c].right and
|
||||||
self.cells[i][j].top):
|
self.cells[r][c].top):
|
||||||
self.cells[i][j].spanning_v = True
|
self.cells[r][c].spanning_v = True
|
||||||
|
|
||||||
elif bound == 2:
|
elif bound == 2:
|
||||||
if self.cells[i][j].left and self.cells[i][j].right:
|
if self.cells[r][c].left and self.cells[r][c].right:
|
||||||
if (not self.cells[i][j].top and
|
if (not self.cells[r][c].top and
|
||||||
not self.cells[i][j].bottom):
|
not self.cells[r][c].bottom):
|
||||||
self.cells[i][j].spanning_v = True
|
self.cells[r][c].spanning_v = True
|
||||||
|
|
||||||
elif self.cells[i][j].top and self.cells[i][j].bottom:
|
elif self.cells[r][c].top and self.cells[r][c].bottom:
|
||||||
if (not self.cells[i][j].left and
|
if (not self.cells[r][c].left and
|
||||||
not self.cells[i][j].right):
|
not self.cells[r][c].right):
|
||||||
self.cells[i][j].spanning_h = True
|
self.cells[r][c].spanning_h = True
|
||||||
|
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
|
@ -213,7 +235,7 @@ class Table:
|
||||||
ar : list
|
ar : list
|
||||||
"""
|
"""
|
||||||
ar = []
|
ar = []
|
||||||
for i in range(len(self.cells)):
|
for r in range(len(self.rows)):
|
||||||
ar.append([self.cells[i][j].get_text().strip()
|
ar.append([self.cells[r][c].get_text().strip()
|
||||||
for j in range(len(self.cells[i]))])
|
for c in range(len(self.cols))])
|
||||||
return ar
|
return ar
|
||||||
|
|
|
||||||
382
camelot/utils.py
382
camelot/utils.py
|
|
@ -1,5 +1,6 @@
|
||||||
from __future__ import division
|
from __future__ import division
|
||||||
import os
|
import os
|
||||||
|
import logging
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
|
|
@ -11,7 +12,8 @@ from pdfminer.pdfinterp import PDFResourceManager
|
||||||
from pdfminer.pdfinterp import PDFPageInterpreter
|
from pdfminer.pdfinterp import PDFPageInterpreter
|
||||||
from pdfminer.pdfdevice import PDFDevice
|
from pdfminer.pdfdevice import PDFDevice
|
||||||
from pdfminer.converter import PDFPageAggregator
|
from pdfminer.converter import PDFPageAggregator
|
||||||
from pdfminer.layout import LAParams, LTChar, LTTextLineHorizontal, LTTextLineVertical
|
from pdfminer.layout import (LAParams, LTAnno, LTChar, LTTextLineHorizontal,
|
||||||
|
LTTextLineVertical)
|
||||||
|
|
||||||
|
|
||||||
def translate(x1, x2):
|
def translate(x1, x2):
|
||||||
|
|
@ -174,22 +176,20 @@ def scale_to_pdf(tables, v_segments, h_segments, factors):
|
||||||
return tables_new, v_segments_new, h_segments_new
|
return tables_new, v_segments_new, h_segments_new
|
||||||
|
|
||||||
|
|
||||||
def get_rotation(ltchar, lttextlh=None, lttextlv=None):
|
def get_rotation(lttextlh, lttextlv, ltchar):
|
||||||
"""Detects if text in table is vertical or not using the current
|
"""Detects if text in table is vertical or not using the current
|
||||||
transformation matrix (CTM) and returns its orientation.
|
transformation matrix (CTM) and returns its orientation.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
ltchar : list
|
|
||||||
List of PDFMiner LTChar objects.
|
|
||||||
|
|
||||||
lttextlh : list
|
lttextlh : list
|
||||||
List of PDFMiner LTTextLineHorizontal objects.
|
List of PDFMiner LTTextLineHorizontal objects.
|
||||||
(optional, default: None)
|
|
||||||
|
|
||||||
lttextlv : list
|
lttextlv : list
|
||||||
List of PDFMiner LTTextLineVertical objects.
|
List of PDFMiner LTTextLineVertical objects.
|
||||||
(optional, default: None)
|
|
||||||
|
ltchar : list
|
||||||
|
List of PDFMiner LTChar objects.
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
|
|
@ -199,15 +199,9 @@ def get_rotation(ltchar, lttextlh=None, lttextlv=None):
|
||||||
anti-clockwise and 'right' if rotated 90 degree clockwise.
|
anti-clockwise and 'right' if rotated 90 degree clockwise.
|
||||||
"""
|
"""
|
||||||
rotation = ''
|
rotation = ''
|
||||||
if lttextlh is not None and lttextlv is not None:
|
hlen = len([t for t in lttextlh if t.get_text().strip()])
|
||||||
hlen = len([t for t in lttextlh if t.get_text().strip()])
|
vlen = len([t for t in lttextlv if t.get_text().strip()])
|
||||||
vlen = len([t for t in lttextlv if t.get_text().strip()])
|
if hlen < vlen:
|
||||||
vger = 0.0
|
|
||||||
else:
|
|
||||||
hlen = len([t for t in ltchar if t.upright and t.get_text().strip()])
|
|
||||||
vlen = len([t for t in ltchar if (not t.upright) and t.get_text().strip()])
|
|
||||||
vger = vlen / float(hlen+vlen)
|
|
||||||
if hlen < vlen or vger > 0.8:
|
|
||||||
clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in ltchar)
|
clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in ltchar)
|
||||||
anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in ltchar)
|
anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in ltchar)
|
||||||
rotation = 'left' if clockwise < anticlockwise else 'right'
|
rotation = 'left' if clockwise < anticlockwise else 'right'
|
||||||
|
|
@ -247,7 +241,183 @@ def segments_bbox(bbox, v_segments, h_segments):
|
||||||
return v_s, h_s
|
return v_s, h_s
|
||||||
|
|
||||||
|
|
||||||
def text_bbox(bbox, text):
|
def rotate_segments(v_s, h_s, table_rotation):
|
||||||
|
"""Rotates line segments if the table is rotated.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
v : list
|
||||||
|
List of vertical line segments.
|
||||||
|
|
||||||
|
h : list
|
||||||
|
List of horizontal line segments.
|
||||||
|
|
||||||
|
table_rotation : string
|
||||||
|
{'', 'left', 'right'}
|
||||||
|
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
vertical : list
|
||||||
|
List of rotated vertical line segments.
|
||||||
|
|
||||||
|
horizontal : list
|
||||||
|
List of rotated horizontal line segments.
|
||||||
|
"""
|
||||||
|
vertical, horizontal = [], []
|
||||||
|
if table_rotation != '':
|
||||||
|
if table_rotation == 'left':
|
||||||
|
for v in v_s:
|
||||||
|
x0, y0 = rotate(0, 0, v[0], v[1], -np.pi / 2)
|
||||||
|
x1, y1 = rotate(0, 0, v[2], v[3], -np.pi / 2)
|
||||||
|
horizontal.append((x0, y0, x1, y1))
|
||||||
|
for h in h_s:
|
||||||
|
x0, y0 = rotate(0, 0, h[0], h[1], -np.pi / 2)
|
||||||
|
x1, y1 = rotate(0, 0, h[2], h[3], -np.pi / 2)
|
||||||
|
vertical.append((x1, y1, x0, y0))
|
||||||
|
elif table_rotation == 'right':
|
||||||
|
for v in v_s:
|
||||||
|
x0, y0 = rotate(0, 0, v[0], v[1], np.pi / 2)
|
||||||
|
x1, y1 = rotate(0, 0, v[2], v[3], np.pi / 2)
|
||||||
|
horizontal.append((x1, y1, x0, y0))
|
||||||
|
for h in h_s:
|
||||||
|
x0, y0 = rotate(0, 0, h[0], h[1], np.pi / 2)
|
||||||
|
x1, y1 = rotate(0, 0, h[2], h[3], np.pi / 2)
|
||||||
|
vertical.append((x0, y0, x1, y1))
|
||||||
|
else:
|
||||||
|
vertical = v_s
|
||||||
|
horizontal = h_s
|
||||||
|
return vertical, horizontal
|
||||||
|
|
||||||
|
|
||||||
|
def rotate_textlines(lh_bbox, lv_bbox, table_rotation):
|
||||||
|
"""Rotates bounding boxes of LTTextLineHorizontals and
|
||||||
|
LTTextLineVerticals if the table is rotated.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
lh_bbox : list
|
||||||
|
List of PDFMiner LTTextLineHorizontal objects.
|
||||||
|
|
||||||
|
lv_bbox : list
|
||||||
|
List of PDFMiner LTTextLineVertical objects.
|
||||||
|
|
||||||
|
table_rotation : string
|
||||||
|
{'', 'left', 'right'}
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
t_bbox : dict
|
||||||
|
Dict with two keys 'horizontal' and 'vertical' with lists of
|
||||||
|
LTTextLineHorizontals and LTTextLineVerticals respectively.
|
||||||
|
"""
|
||||||
|
t_bbox = {}
|
||||||
|
if table_rotation != '':
|
||||||
|
if table_rotation == 'left':
|
||||||
|
for t in lh_bbox:
|
||||||
|
x0, y0, x1, y1 = t.bbox
|
||||||
|
x0, y0 = rotate(0, 0, x0, y0, -np.pi / 2)
|
||||||
|
x1, y1 = rotate(0, 0, x1, y1, -np.pi / 2)
|
||||||
|
t.set_bbox((x1, y0, x0, y1))
|
||||||
|
for obj in t._objs:
|
||||||
|
if isinstance(obj, LTChar):
|
||||||
|
x0, y0, x1, y1 = obj.bbox
|
||||||
|
x0, y0 = rotate(0, 0, x0, y0, -np.pi / 2)
|
||||||
|
x1, y1 = rotate(0, 0, x1, y1, -np.pi / 2)
|
||||||
|
obj.set_bbox((x1, y0, x0, y1))
|
||||||
|
for t in lv_bbox:
|
||||||
|
x0, y0, x1, y1 = t.bbox
|
||||||
|
x0, y0 = rotate(0, 0, x0, y0, -np.pi / 2)
|
||||||
|
x1, y1 = rotate(0, 0, x1, y1, -np.pi / 2)
|
||||||
|
t.set_bbox((x0, y1, x1, y0))
|
||||||
|
for obj in t._objs:
|
||||||
|
if isinstance(obj, LTChar):
|
||||||
|
x0, y0, x1, y1 = obj.bbox
|
||||||
|
x0, y0 = rotate(0, 0, x0, y0, -np.pi / 2)
|
||||||
|
x1, y1 = rotate(0, 0, x1, y1, -np.pi / 2)
|
||||||
|
obj.set_bbox((x0, y1, x1, y0))
|
||||||
|
elif table_rotation == 'right':
|
||||||
|
for t in lh_bbox:
|
||||||
|
x0, y0, x1, y1 = t.bbox
|
||||||
|
x0, y0 = rotate(0, 0, x0, y0, np.pi / 2)
|
||||||
|
x1, y1 = rotate(0, 0, x1, y1, np.pi / 2)
|
||||||
|
t.set_bbox((x0, y1, x1, y0))
|
||||||
|
for obj in t._objs:
|
||||||
|
if isinstance(obj, LTChar):
|
||||||
|
x0, y0, x1, y1 = obj.bbox
|
||||||
|
x0, y0 = rotate(0, 0, x0, y0, np.pi / 2)
|
||||||
|
x1, y1 = rotate(0, 0, x1, y1, np.pi / 2)
|
||||||
|
obj.set_bbox((x0, y1, x1, y0))
|
||||||
|
for t in lv_bbox:
|
||||||
|
x0, y0, x1, y1 = t.bbox
|
||||||
|
x0, y0 = rotate(0, 0, x0, y0, np.pi / 2)
|
||||||
|
x1, y1 = rotate(0, 0, x1, y1, np.pi / 2)
|
||||||
|
t.set_bbox((x1, y0, x0, y1))
|
||||||
|
for obj in t._objs:
|
||||||
|
if isinstance(obj, LTChar):
|
||||||
|
x0, y0, x1, y1 = obj.bbox
|
||||||
|
x0, y0 = rotate(0, 0, x0, y0, np.pi / 2)
|
||||||
|
x1, y1 = rotate(0, 0, x1, y1, np.pi / 2)
|
||||||
|
obj.set_bbox((x1, y0, x0, y1))
|
||||||
|
t_bbox['horizontal'] = lv_bbox
|
||||||
|
t_bbox['vertical'] = lh_bbox
|
||||||
|
else:
|
||||||
|
t_bbox['horizontal'] = lh_bbox
|
||||||
|
t_bbox['vertical'] = lv_bbox
|
||||||
|
return t_bbox
|
||||||
|
|
||||||
|
|
||||||
|
def rotate_table(R, C, table_rotation):
|
||||||
|
"""Rotates coordinates of table rows and columns.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
R : list
|
||||||
|
List of row x-coordinates.
|
||||||
|
|
||||||
|
C : list
|
||||||
|
List of column y-coordinates.
|
||||||
|
|
||||||
|
table_rotation : string
|
||||||
|
{'', 'left', 'right'}
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
rows : list
|
||||||
|
List of rotated row x-coordinates.
|
||||||
|
|
||||||
|
cols : list
|
||||||
|
List of rotated column y-coordinates.
|
||||||
|
"""
|
||||||
|
rows, cols = [], []
|
||||||
|
if table_rotation != '':
|
||||||
|
if table_rotation == 'left':
|
||||||
|
for r in R:
|
||||||
|
r0, r1 = rotate(0, 0, 0, r[0], -np.pi / 2)
|
||||||
|
r2, r3 = rotate(0, 0, 0, r[1], -np.pi / 2)
|
||||||
|
cols.append((r2, r0))
|
||||||
|
cols = sorted(cols)
|
||||||
|
for c in C:
|
||||||
|
c0, c1 = rotate(0, 0, c[0], 0, -np.pi / 2)
|
||||||
|
c2, c3 = rotate(0, 0, c[1], 0, -np.pi / 2)
|
||||||
|
rows.append((c1, c3))
|
||||||
|
elif table_rotation == 'right':
|
||||||
|
for r in R:
|
||||||
|
r0, r1 = rotate(0, 0, 0, r[0], np.pi / 2)
|
||||||
|
r2, r3 = rotate(0, 0, 0, r[1], np.pi / 2)
|
||||||
|
cols.append((r0, r2))
|
||||||
|
for c in C:
|
||||||
|
c0, c1 = rotate(0, 0, c[0], 0, np.pi / 2)
|
||||||
|
c2, c3 = rotate(0, 0, c[1], 0, np.pi / 2)
|
||||||
|
rows.append((c3, c1))
|
||||||
|
rows = sorted(rows, reverse=True)
|
||||||
|
else:
|
||||||
|
rows = R
|
||||||
|
cols = C
|
||||||
|
return rows, cols
|
||||||
|
|
||||||
|
|
||||||
|
def text_in_bbox(bbox, text):
|
||||||
"""Returns all text objects present inside a
|
"""Returns all text objects present inside a
|
||||||
table's bounding box.
|
table's bounding box.
|
||||||
|
|
||||||
|
|
@ -330,66 +500,141 @@ def merge_close_values(ar, mtol=2):
|
||||||
return ret
|
return ret
|
||||||
|
|
||||||
|
|
||||||
def get_row_index(t, rows):
|
def split_textline(table, textline, direction):
|
||||||
"""Gets index of the row in which the given text object lies by
|
"""Splits PDFMiner LTTextLine into substrings if it spans across
|
||||||
comparing their y-coordinates.
|
multiple rows/columns.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
t : object
|
table : object
|
||||||
|
camelot.pdf.Pdf
|
||||||
|
|
||||||
rows : list
|
textline : object
|
||||||
List of row coordinate tuples, sorted in decreasing order.
|
PDFMiner LTTextLine object.
|
||||||
|
|
||||||
|
direction : string
|
||||||
|
{'horizontal', 'vertical'}
|
||||||
|
Direction of the PDFMiner LTTextLine object.
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
r : int
|
cut_text : list
|
||||||
|
List of tuples of the form (idx, text) where idx is the index
|
||||||
error : float
|
of row/column and text is the an lttextline substring.
|
||||||
"""
|
"""
|
||||||
offset1, offset2 = 0, 0
|
idx = 0
|
||||||
for r in range(len(rows)):
|
cut_text = []
|
||||||
if (t.y0 + t.y1) / 2.0 < rows[r][0] and (t.y0 + t.y1) / 2.0 > rows[r][1]:
|
bbox = textline.bbox
|
||||||
if t.y0 > rows[r][0]:
|
if direction == 'horizontal' and not textline.is_empty():
|
||||||
offset1 = abs(t.y0 - rows[r][0])
|
x_overlap = [i for i, x in enumerate(table.cols) if x[0] <= bbox[2] and bbox[0] <= x[1]]
|
||||||
if t.y1 < rows[r][1]:
|
r_idx = [j for j, r in enumerate(table.rows) if r[1] <= (bbox[1] + bbox[3]) / 2 <= r[0]]
|
||||||
offset2 = abs(t.y1 - rows[r][1])
|
r = r_idx[0]
|
||||||
X = 1.0 if abs(t.x0 - t.x1) == 0.0 else abs(t.x0 - t.x1)
|
x_cuts = [(c, table.cells[r][c].x2) for c in x_overlap if table.cells[r][c].right]
|
||||||
Y = 1.0 if abs(t.y0 - t.y1) == 0.0 else abs(t.y0 - t.y1)
|
if not x_cuts:
|
||||||
charea = X * Y
|
x_cuts = [(x_overlap[0], table.cells[r][-1].x2)]
|
||||||
error = (X * (offset1 + offset2)) / charea
|
for obj in textline._objs:
|
||||||
return r, error
|
row = table.rows[r]
|
||||||
|
for cut in x_cuts:
|
||||||
|
if isinstance(obj, LTChar):
|
||||||
|
if (row[1] <= (obj.y0 + obj.y1) / 2 <= row[0] and
|
||||||
|
(obj.x0 + obj.x1) / 2 <= cut[1]):
|
||||||
|
cut_text.append((r, cut[0], obj.get_text().strip('\n')))
|
||||||
|
break
|
||||||
|
elif isinstance(obj, LTAnno):
|
||||||
|
cut_text.append((r, cut[0], obj.get_text().strip('\n')))
|
||||||
|
elif direction == 'vertical' and not textline.is_empty():
|
||||||
|
y_overlap = [j for j, y in enumerate(table.rows) if y[1] <= bbox[3] and bbox[1] <= y[0]]
|
||||||
|
c_idx = [i for i, c in enumerate(table.cols) if c[0] <= (bbox[0] + bbox[2]) / 2 <= c[1]]
|
||||||
|
c = c_idx[0]
|
||||||
|
y_cuts = [(r, table.cells[r][c].y1) for r in y_overlap if table.cells[r][c].bottom]
|
||||||
|
if not y_cuts:
|
||||||
|
y_cuts = [(y_overlap[0], table.cells[-1][c].y1)]
|
||||||
|
for obj in textline._objs:
|
||||||
|
col = table.cols[c]
|
||||||
|
for cut in y_cuts:
|
||||||
|
if isinstance(obj, LTChar):
|
||||||
|
if (col[0] <= (obj.x0 + obj.x1) / 2 <= col[1] and
|
||||||
|
(obj.y0 + obj.y1) / 2 >= cut[1]):
|
||||||
|
cut_text.append((cut[0], c, obj.get_text()))
|
||||||
|
break
|
||||||
|
elif isinstance(obj, LTAnno):
|
||||||
|
cut_text.append((cut[0], c, obj.get_text().strip('\n')))
|
||||||
|
return cut_text
|
||||||
|
|
||||||
|
|
||||||
def get_column_index(t, columns):
|
def get_table_index(table, t, direction, split_text=False):
|
||||||
"""Gets index of the column in which the given text object lies by
|
"""Gets indices of the cell where given text object lies by
|
||||||
comparing their x-coordinates.
|
comparing their y and x-coordinates.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
t : object
|
table : object
|
||||||
|
camelot.table.Table
|
||||||
|
|
||||||
columns : list
|
t : object
|
||||||
List of column coordinate tuples.
|
PDFMiner LTTextLine object.
|
||||||
|
|
||||||
|
direction : string
|
||||||
|
{'horizontal', 'vertical'}
|
||||||
|
Direction of the PDFMiner LTTextLine object.
|
||||||
|
|
||||||
|
split_text : bool
|
||||||
|
Whether or not to split a text line if it spans across
|
||||||
|
multiple cells.
|
||||||
|
(optional, default: False)
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
c : int
|
indices : list
|
||||||
|
List of tuples of the form (idx, text) where idx is the index
|
||||||
|
of row/column and text is the an lttextline substring.
|
||||||
|
|
||||||
error : float
|
error : float
|
||||||
|
Assignment error, percentage of text area that lies outside
|
||||||
|
a cell.
|
||||||
|
+-------+
|
||||||
|
| |
|
||||||
|
| [Text bounding box]
|
||||||
|
| |
|
||||||
|
+-------+
|
||||||
"""
|
"""
|
||||||
offset1, offset2 = 0, 0
|
r_idx, c_idx = [-1] * 2
|
||||||
for c in range(len(columns)):
|
for r in range(len(table.rows)):
|
||||||
if (t.x0 + t.x1) / 2.0 > columns[c][0] and (t.x0 + t.x1) / 2.0 < columns[c][1]:
|
if ((t.y0 + t.y1) / 2.0 < table.rows[r][0] and
|
||||||
if t.x0 < columns[c][0]:
|
(t.y0 + t.y1) / 2.0 > table.rows[r][1]):
|
||||||
offset1 = abs(t.x0 - columns[c][0])
|
lt_col_overlap = []
|
||||||
if t.x1 > columns[c][1]:
|
for c in table.cols:
|
||||||
offset2 = abs(t.x1 - columns[c][1])
|
if c[0] <= t.x1 and c[1] >= t.x0:
|
||||||
X = 1.0 if abs(t.x0 - t.x1) == 0.0 else abs(t.x0 - t.x1)
|
left = t.x0 if c[0] <= t.x0 else c[0]
|
||||||
Y = 1.0 if abs(t.y0 - t.y1) == 0.0 else abs(t.y0 - t.y1)
|
right = t.x1 if c[1] >= t.x1 else c[1]
|
||||||
charea = X * Y
|
lt_col_overlap.append(abs(left - right) / abs(c[0] - c[1]))
|
||||||
error = (Y * (offset1 + offset2)) / charea
|
else:
|
||||||
return c, error
|
lt_col_overlap.append(-1)
|
||||||
|
if len(filter(lambda x: x != -1, lt_col_overlap)) == 0:
|
||||||
|
logging.warning("Text doesn't fit any column.")
|
||||||
|
r_idx = r
|
||||||
|
c_idx = lt_col_overlap.index(max(lt_col_overlap))
|
||||||
|
break
|
||||||
|
|
||||||
|
# error calculation
|
||||||
|
y0_offset, y1_offset, x0_offset, x1_offset = [0] * 4
|
||||||
|
if t.y0 > table.rows[r_idx][0]:
|
||||||
|
y0_offset = abs(t.y0 - table.rows[r_idx][0])
|
||||||
|
if t.y1 < table.rows[r_idx][1]:
|
||||||
|
y1_offset = abs(t.y1 - table.rows[r_idx][1])
|
||||||
|
if t.x0 < table.cols[c_idx][0]:
|
||||||
|
x0_offset = abs(t.x0 - table.cols[c_idx][0])
|
||||||
|
if t.x1 > table.cols[c_idx][1]:
|
||||||
|
x1_offset = abs(t.x1 - table.cols[c_idx][1])
|
||||||
|
X = 1.0 if abs(t.x0 - t.x1) == 0.0 else abs(t.x0 - t.x1)
|
||||||
|
Y = 1.0 if abs(t.y0 - t.y1) == 0.0 else abs(t.y0 - t.y1)
|
||||||
|
charea = X * Y
|
||||||
|
error = ((X * (y0_offset + y1_offset)) + (Y * (x0_offset + x1_offset))) / charea
|
||||||
|
|
||||||
|
if split_text:
|
||||||
|
return split_textline(table, t, direction), error
|
||||||
|
else:
|
||||||
|
return [(r_idx, c_idx, t.get_text().strip('\n'))], error
|
||||||
|
|
||||||
|
|
||||||
def get_score(error_weights):
|
def get_score(error_weights):
|
||||||
|
|
@ -448,9 +693,14 @@ def count_empty(d):
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
n_empty_rows : number of empty rows
|
n_empty_rows : list
|
||||||
n_empty_cols : number of empty columns
|
Number of empty rows.
|
||||||
empty_p : percentage of empty cells
|
|
||||||
|
n_empty_cols : list
|
||||||
|
Number of empty columns.
|
||||||
|
|
||||||
|
empty_p : float
|
||||||
|
Percentage of empty cells.
|
||||||
"""
|
"""
|
||||||
empty_p = 0
|
empty_p = 0
|
||||||
r_nempty_cells, c_nempty_cells = [], []
|
r_nempty_cells, c_nempty_cells = [], []
|
||||||
|
|
@ -491,7 +741,7 @@ def encode_list(ar):
|
||||||
return ar
|
return ar
|
||||||
|
|
||||||
|
|
||||||
def get_text_objects(layout, LTType="char", t=None):
|
def get_text_objects(layout, ltype="char", t=None):
|
||||||
"""Recursively parses pdf layout to get a list of
|
"""Recursively parses pdf layout to get a list of
|
||||||
text objects.
|
text objects.
|
||||||
|
|
||||||
|
|
@ -500,7 +750,7 @@ def get_text_objects(layout, LTType="char", t=None):
|
||||||
layout : object
|
layout : object
|
||||||
PDFMiner LTPage object.
|
PDFMiner LTPage object.
|
||||||
|
|
||||||
LTType : string
|
ltype : string
|
||||||
{'char', 'lh', 'lv'}
|
{'char', 'lh', 'lv'}
|
||||||
Specify 'char', 'lh', 'lv' to get LTChar, LTTextLineHorizontal,
|
Specify 'char', 'lh', 'lv' to get LTChar, LTTextLineHorizontal,
|
||||||
and LTTextLineVertical objects respectively.
|
and LTTextLineVertical objects respectively.
|
||||||
|
|
@ -512,11 +762,11 @@ def get_text_objects(layout, LTType="char", t=None):
|
||||||
t : list
|
t : list
|
||||||
List of PDFMiner text objects.
|
List of PDFMiner text objects.
|
||||||
"""
|
"""
|
||||||
if LTType == "char":
|
if ltype == "char":
|
||||||
LTObject = LTChar
|
LTObject = LTChar
|
||||||
elif LTType == "lh":
|
elif ltype == "lh":
|
||||||
LTObject = LTTextLineHorizontal
|
LTObject = LTTextLineHorizontal
|
||||||
elif LTType == "lv":
|
elif ltype == "lv":
|
||||||
LTObject = LTTextLineVertical
|
LTObject = LTTextLineVertical
|
||||||
if t is None:
|
if t is None:
|
||||||
t = []
|
t = []
|
||||||
|
|
@ -525,7 +775,7 @@ def get_text_objects(layout, LTType="char", t=None):
|
||||||
if isinstance(obj, LTObject):
|
if isinstance(obj, LTObject):
|
||||||
t.append(obj)
|
t.append(obj)
|
||||||
else:
|
else:
|
||||||
t += get_text_objects(obj, LTType=LTType)
|
t += get_text_objects(obj, ltype=ltype)
|
||||||
except AttributeError:
|
except AttributeError:
|
||||||
pass
|
pass
|
||||||
return t
|
return t
|
||||||
|
|
|
||||||
|
|
@ -32,7 +32,7 @@ def test_lattice_basic():
|
||||||
def test_lattice_fill():
|
def test_lattice_fill():
|
||||||
|
|
||||||
data = [
|
data = [
|
||||||
["Plan Type","County","Plan Name","Totals"],
|
["Plan Type","County","Plan Name","Totals"],
|
||||||
["GMC","Sacramento","Anthem Blue Cross","164,380"],
|
["GMC","Sacramento","Anthem Blue Cross","164,380"],
|
||||||
["GMC","Sacramento","Health Net","126,547"],
|
["GMC","Sacramento","Health Net","126,547"],
|
||||||
["GMC","Sacramento","Kaiser Foundation","74,620"],
|
["GMC","Sacramento","Kaiser Foundation","74,620"],
|
||||||
|
|
@ -122,46 +122,4 @@ def test_lattice_table_rotation():
|
||||||
pdfname = os.path.join(testdir, 'right_rotated_table_1.pdf')
|
pdfname = os.path.join(testdir, 'right_rotated_table_1.pdf')
|
||||||
manager = Pdf(Lattice(), pdfname, clean=True)
|
manager = Pdf(Lattice(), pdfname, clean=True)
|
||||||
tables = manager.extract()
|
tables = manager.extract()
|
||||||
assert_equal(tables['page-1']['table-1']['data'], data)
|
|
||||||
|
|
||||||
def test_lattice_cell_rotation():
|
|
||||||
|
|
||||||
data = [
|
|
||||||
["Sl.No.","District","Projected Population for 2012-13(In lakhs)","Adult Equivalent to 88%(In lakhs)","Total Consumptionrequirement(@ 400gms/adult/day)(In Lakh tonnes)","Total Requirement(Including seeds, feeds & wastage)(In Lakh tonnes)","Production (Rice)(In Lakh tonnes)","","","Surplus/Defi cit(In Lakh tonnes)",""],
|
|
||||||
["","","","","","","Kharif","Rabi","Total","Rice","Paddy"],
|
|
||||||
["1","Balasore","23.65","20.81","3.04","3.47","2.78","0.86","3.64","0.17","0.25"],
|
|
||||||
["2","Bhadrak","15.34","13.50","1.97","2.25","3.50","0.05","3.55","1.30","1.94"],
|
|
||||||
["3","Balangir","17.01","14.97","2.19","2.50","6.23","0.10","6.33","3.83","5.72"],
|
|
||||||
["4","Subarnapur","6.70","5.90","0.86","0.98","4.48","1.13","5.61","4.63","6.91"],
|
|
||||||
["5","Cuttack","26.63","23.43","3.42","3.91","3.75","0.06","3.81","-0.10","-0.15"],
|
|
||||||
["6","Jagatsingpur","11.49","10.11","1.48","1.69","2.10","0.02","2.12","0.43","0.64"],
|
|
||||||
["7","Jajpur","18.59","16.36","2.39","2.73","2.13","0.04","2.17","-0.56","-0.84"],
|
|
||||||
["8","Kendrapara","14.62","12.87","1.88","2.15","2.60","0.07","2.67","0.52","0.78"],
|
|
||||||
["9","Dhenkanal","12.13","10.67","1.56","1.78","2.26","0.02","2.28","0.50","0.75"],
|
|
||||||
["10","Angul","12.93","11.38","1.66","1.90","1.73","0.02","1.75","-0.15","-0.22"],
|
|
||||||
["11","Ganjam","35.77","31.48","4.60","5.26","4.57","0.00","4.57","-0.69","-1.03"],
|
|
||||||
["12","Gajapati","5.85","5.15","0.75","0.86","0.68","0.01","0.69","-0.17","-0.25"],
|
|
||||||
["13","Kalahandi","16.12","14.19","2.07","2.37","5.42","1.13","6.55","4.18","6.24"],
|
|
||||||
["14","Nuapada","6.18","5.44","0.79","0.90","1.98","0.08","2.06","1.16","1.73"],
|
|
||||||
["15","Keonjhar","18.42","16.21","2.37","2.71","2.76","0.08","2.84","0.13","0.19"],
|
|
||||||
["16","Koraput","14.09","12.40","1.81","2.07","2.08","0.34","2.42","0.35","0.52"],
|
|
||||||
["17","Malkangiri","6.31","5.55","0.81","0.93","1.78","0.04","1.82","0.89","1.33"],
|
|
||||||
["18","Nabarangpur","12.50","11.00","1.61","1.84","3.26","0.02","3.28","1.44","2.15"],
|
|
||||||
["19","Rayagada","9.83","8.65","1.26","1.44","1.15","0.03","1.18","-0.26","-0.39"],
|
|
||||||
["20","Mayurbhanj","25.61","22.54","3.29","3.76","4.90","0.06","4.96","1.20","1.79"],
|
|
||||||
["21","Kandhamal","7.45","6.56","0.96","1.10","0.70","0.01","0.71","-0.39","-0.58"],
|
|
||||||
["22","Boudh","4.51","3.97","0.58","0.66","1.73","0.03","1.76","1.10","1.64"],
|
|
||||||
["23","Puri","17.29","15.22","2.22","2.54","2.45","0.99","3.44","0.90","1.34"],
|
|
||||||
["24","Khordha","23.08","20.31","2.97","3.39","2.02","0.03","2.05","-1.34","-2.00"],
|
|
||||||
["25","Nayagarh","9.78","8.61","1.26","1.44","2.10","0.00","2.10","0.66","0.99"],
|
|
||||||
["26","Sambalpur","10.62","9.35","1.37","1.57","3.45","0.71","4.16","2.59","3.87"],
|
|
||||||
["27","Bargarh","15.00","13.20","1.93","2.21","6.87","2.65","9.52","7.31","10.91"],
|
|
||||||
["28","Deogarh","3.18","2.80","0.41","0.47","1.12","0.07","1.19","0.72","1.07"],
|
|
||||||
["29","Jharsuguda","5.91","5.20","0.76","0.87","0.99","0.01","1.00","0.13","0.19"],
|
|
||||||
["30","Sundargarh","21.21","18.66","2.72","3.11","4.72","0.02","4.74","1.63","2.43"],
|
|
||||||
["ODISHA","","427.80","376.49","54.99","62.86","86.29","8.68","94.97","32.11","47.92"]
|
|
||||||
]
|
|
||||||
pdfname = os.path.join(testdir, 'agstat.pdf')
|
|
||||||
manager = Pdf(Lattice(), pdfname, clean=True)
|
|
||||||
tables = manager.extract()
|
|
||||||
assert_equal(tables['page-1']['table-1']['data'], data)
|
assert_equal(tables['page-1']['table-1']['data'], data)
|
||||||
|
|
@ -169,45 +169,45 @@ def test_stream_columns():
|
||||||
def test_stream_table_rotation():
|
def test_stream_table_rotation():
|
||||||
|
|
||||||
data = [
|
data = [
|
||||||
["Table 21 Current use of contraception by background characteristics—Continued","","","","","","","","","","","","","","","",""],
|
["","","Table 21 Current use of contraception by background characteristics—Continued","","","","","","","","","","","","","","",""],
|
||||||
["","","","","","Modern method","","","","","","","Traditional method","","","",""],
|
["","","","","","","Modern method","","","","","","","Traditional method","","","",""],
|
||||||
["","","Any","","","","","","","Other","Any","","","","Not","","Number"],
|
["","","","Any","","","","","","","Other","Any","","","","Not","","Number"],
|
||||||
["","Any","modern","Female","Male","","","","Condom/","modern","traditional","","With-","Folk","currently","","of"],
|
["","","Any","modern","Female","Male","","","","Condom/","modern","traditional","","With-","Folk","currently","","of"],
|
||||||
["Background characteristic","method","method","sterilization","sterilization","Pill","IUD","Injectables","Nirodh","method","method","Rhythm","drawal","method","using","Total","women"],
|
["","Background characteristic","method","method","sterilization","sterilization","Pill","IUD","Injectables","Nirodh","method","method","Rhythm","drawal","method","using","Total","women"],
|
||||||
["Caste/tribe","","","","","","","","","","","","","","","",""],
|
["","Caste/tribe","","","","","","","","","","","","","","","",""],
|
||||||
["Scheduled caste","74.8","55.8","42.9","0.9","9.7","0.0","0.2","2.2","0.0","19.0","11.2","7.4","0.4","25.2","100.0","1,363"],
|
["","Scheduled caste","74.8","55.8","42.9","0.9","9.7","0.0","0.2","2.2","0.0","19.0","11.2","7.4","0.4","25.2","100.0","1,363"],
|
||||||
["Scheduled tribe","59.3","39.0","26.8","0.6","6.4","0.6","1.2","3.5","0.0","20.3","10.4","5.8","4.1","40.7","100.0","256"],
|
["","Scheduled tribe","59.3","39.0","26.8","0.6","6.4","0.6","1.2","3.5","0.0","20.3","10.4","5.8","4.1","40.7","100.0","256"],
|
||||||
["Other backward class","71.4","51.1","34.9","0.0","8.6","1.4","0.0","6.2","0.0","20.4","12.6","7.8","0.0","28.6","100.0","211"],
|
["","Other backward class","71.4","51.1","34.9","0.0","8.6","1.4","0.0","6.2","0.0","20.4","12.6","7.8","0.0","28.6","100.0","211"],
|
||||||
["Other","71.1","48.8","28.2","0.8","13.3","0.9","0.3","5.2","0.1","22.3","12.9","9.1","0.3","28.9","100.0","3,319"],
|
["","Other","71.1","48.8","28.2","0.8","13.3","0.9","0.3","5.2","0.1","22.3","12.9","9.1","0.3","28.9","100.0","3,319"],
|
||||||
["Wealth index","","","","","","","","","","","","","","","",""],
|
["","Wealth index","","","","","","","","","","","","","","","",""],
|
||||||
["Lowest","64.5","48.6","34.3","0.5","10.5","0.6","0.7","2.0","0.0","15.9","9.9","4.6","1.4","35.5","100.0","1,258"],
|
["","Lowest","64.5","48.6","34.3","0.5","10.5","0.6","0.7","2.0","0.0","15.9","9.9","4.6","1.4","35.5","100.0","1,258"],
|
||||||
["Second","68.5","50.4","36.2","1.1","11.4","0.5","0.1","1.1","0.0","18.1","11.2","6.7","0.2","31.5","100.0","1,317"],
|
["","Second","68.5","50.4","36.2","1.1","11.4","0.5","0.1","1.1","0.0","18.1","11.2","6.7","0.2","31.5","100.0","1,317"],
|
||||||
["Middle","75.5","52.8","33.6","0.6","14.2","0.4","0.5","3.4","0.1","22.7","13.4","8.9","0.4","24.5","100.0","1,018"],
|
["","Middle","75.5","52.8","33.6","0.6","14.2","0.4","0.5","3.4","0.1","22.7","13.4","8.9","0.4","24.5","100.0","1,018"],
|
||||||
["Fourth","73.9","52.3","32.0","0.5","12.5","0.6","0.2","6.3","0.2","21.6","11.5","9.9","0.2","26.1","100.0","908"],
|
["","Fourth","73.9","52.3","32.0","0.5","12.5","0.6","0.2","6.3","0.2","21.6","11.5","9.9","0.2","26.1","100.0","908"],
|
||||||
["Highest","78.3","44.4","19.5","1.0","9.7","1.4","0.0","12.7","0.0","33.8","18.2","15.6","0.0","21.7","100.0","733"],
|
["","Highest","78.3","44.4","19.5","1.0","9.7","1.4","0.0","12.7","0.0","33.8","18.2","15.6","0.0","21.7","100.0","733"],
|
||||||
["Number of living children","","","","","","","","","","","","","","","",""],
|
["","Number of living children","","","","","","","","","","","","","","","",""],
|
||||||
["No children","25.1","7.6","0.3","0.5","2.0","0.0","0.0","4.8","0.0","17.5","9.0","8.5","0.0","74.9","100.0","563"],
|
["","No children","25.1","7.6","0.3","0.5","2.0","0.0","0.0","4.8","0.0","17.5","9.0","8.5","0.0","74.9","100.0","563"],
|
||||||
["1 child","66.5","32.1","3.7","0.7","20.1","0.7","0.1","6.9","0.0","34.3","18.9","15.2","0.3","33.5","100.0","1,190"],
|
["","1 child","66.5","32.1","3.7","0.7","20.1","0.7","0.1","6.9","0.0","34.3","18.9","15.2","0.3","33.5","100.0","1,190"],
|
||||||
["1 son","66.8","33.2","4.1","0.7","21.1","0.5","0.3","6.6","0.0","33.5","21.2","12.3","0.0","33.2","100.0","672"],
|
["","1 son","66.8","33.2","4.1","0.7","21.1","0.5","0.3","6.6","0.0","33.5","21.2","12.3","0.0","33.2","100.0","672"],
|
||||||
["No sons","66.1","30.7","3.1","0.6","18.8","0.8","0.0","7.3","0.0","35.4","15.8","19.0","0.6","33.9","100.0","517"],
|
["","No sons","66.1","30.7","3.1","0.6","18.8","0.8","0.0","7.3","0.0","35.4","15.8","19.0","0.6","33.9","100.0","517"],
|
||||||
["2 children","81.6","60.5","41.8","0.9","11.6","0.8","0.3","4.8","0.2","21.1","12.2","8.3","0.6","18.4","100.0","1,576"],
|
["","2 children","81.6","60.5","41.8","0.9","11.6","0.8","0.3","4.8","0.2","21.1","12.2","8.3","0.6","18.4","100.0","1,576"],
|
||||||
["1 or more sons","83.7","64.2","46.4","0.9","10.8","0.8","0.4","4.8","0.1","19.5","11.1","7.6","0.7","16.3","100.0","1,268"],
|
["","1 or more sons","83.7","64.2","46.4","0.9","10.8","0.8","0.4","4.8","0.1","19.5","11.1","7.6","0.7","16.3","100.0","1,268"],
|
||||||
["No sons","73.2","45.5","23.2","1.0","15.1","0.9","0.0","4.8","0.5","27.7","16.8","11.0","0.0","26.8","100.0","308"],
|
["","No sons","73.2","45.5","23.2","1.0","15.1","0.9","0.0","4.8","0.5","27.7","16.8","11.0","0.0","26.8","100.0","308"],
|
||||||
["3 children","83.9","71.2","57.7","0.8","9.8","0.6","0.5","1.8","0.0","12.7","8.7","3.3","0.8","16.1","100.0","961"],
|
["","3 children","83.9","71.2","57.7","0.8","9.8","0.6","0.5","1.8","0.0","12.7","8.7","3.3","0.8","16.1","100.0","961"],
|
||||||
["1 or more sons","85.0","73.2","60.3","0.9","9.4","0.5","0.5","1.6","0.0","11.8","8.1","3.0","0.7","15.0","100.0","860"],
|
["","1 or more sons","85.0","73.2","60.3","0.9","9.4","0.5","0.5","1.6","0.0","11.8","8.1","3.0","0.7","15.0","100.0","860"],
|
||||||
["No sons","74.7","53.8","35.3","0.0","13.7","1.6","0.0","3.2","0.0","20.9","13.4","6.1","1.5","25.3","100.0","101"],
|
["","No sons","74.7","53.8","35.3","0.0","13.7","1.6","0.0","3.2","0.0","20.9","13.4","6.1","1.5","25.3","100.0","101"],
|
||||||
["4+ children","74.3","58.1","45.1","0.6","8.7","0.6","0.7","2.4","0.0","16.1","9.9","5.4","0.8","25.7","100.0","944"],
|
["","4+ children","74.3","58.1","45.1","0.6","8.7","0.6","0.7","2.4","0.0","16.1","9.9","5.4","0.8","25.7","100.0","944"],
|
||||||
["1 or more sons","73.9","58.2","46.0","0.7","8.3","0.7","0.7","1.9","0.0","15.7","9.4","5.5","0.8","26.1","100.0","901"],
|
["","1 or more sons","73.9","58.2","46.0","0.7","8.3","0.7","0.7","1.9","0.0","15.7","9.4","5.5","0.8","26.1","100.0","901"],
|
||||||
["No sons","(82.1)","(57.3)","(25.6)","(0.0)","(17.8)","(0.0)","(0.0)","(13.9)","(0.0)","(24.8)","(21.3)","(3.5)","(0.0)","(17.9)","100.0","43"],
|
["","No sons","(82.1)","(57.3)","(25.6)","(0.0)","(17.8)","(0.0)","(0.0)","(13.9)","(0.0)","(24.8)","(21.3)","(3.5)","(0.0)","(17.9)","100.0","43"],
|
||||||
["Total","71.2","49.9","32.2","0.7","11.7","0.6","0.3","4.3","0.1","21.3","12.3","8.4","0.5","28.8","100.0","5,234"],
|
["","Total","71.2","49.9","32.2","0.7","11.7","0.6","0.3","4.3","0.1","21.3","12.3","8.4","0.5","28.8","100.0","5,234"],
|
||||||
["NFHS-2 (1998-99)","66.6","47.3","32.0","1.8","9.2","1.4","na","2.9","na","na","8.7","9.8","na","33.4","100.0","4,116"],
|
["","NFHS-2 (1998-99)","66.6","47.3","32.0","1.8","9.2","1.4","na","2.9","na","na","8.7","9.8","na","33.4","100.0","4,116"],
|
||||||
["NFHS-1 (1992-93)","57.7","37.6","26.5","4.3","3.6","1.3","0.1","1.9","na","na","11.3","8.3","na","42.3","100.0","3,970"],
|
["","NFHS-1 (1992-93)","57.7","37.6","26.5","4.3","3.6","1.3","0.1","1.9","na","na","11.3","8.3","na","42.3","100.0","3,970"],
|
||||||
["","Note: If more than one method is used, only the most effective method is considered in this tabulation. Total includes women for whom caste/tribe was not known or is missing, who are","","","","","","","","","","","","","","",""],
|
["","","Note: If more than one method is used, only the most effective method is considered in this tabulation. Total includes women for whom caste/tribe was not known or is missing, who are","","","","","","","","","","","","","","",""],
|
||||||
["not shown separately.","","","","","","","","","","","","","","","",""],
|
["","not shown separately.","","","","","","","","","","","","","","","",""],
|
||||||
["na = Not available","","","","","","","","","","","","","","","",""],
|
["","na = Not available","","","","","","","","","","","","","","","",""],
|
||||||
["","ns = Not shown; see table 2b, footnote 1","","","","","","","","","","","","","","",""],
|
["","","ns = Not shown; see table 2b, footnote 1","","","","","","","","","","","","","","",""],
|
||||||
["( ) Based on 25-49 unweighted cases.","","","","","","","","","","","","","","","",""],
|
["","( ) Based on 25-49 unweighted cases.","","","","","","","","","","","","","","","",""],
|
||||||
["","","","","","","","54","","","","","","","","",""]
|
["","","","","","","","","54","","","","","","","","",""]
|
||||||
]
|
]
|
||||||
pdfname = os.path.join(testdir, "left_rotated_table_2.pdf")
|
pdfname = os.path.join(testdir, "left_rotated_table_2.pdf")
|
||||||
manager = Pdf(Stream(), pdfname, clean=True)
|
manager = Pdf(Stream(), pdfname, clean=True)
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue