Add headers param

pull/2/head
Vinayak Mehta 2016-10-12 13:59:10 +05:30 committed by GitHub
parent a43d5ca2c7
commit e8b93a9624
2 changed files with 45 additions and 8 deletions

View File

@ -124,7 +124,7 @@ class Lattice:
Parameters
----------
table_area : list
List of tuples of the form (x1, y1, x2, y2) where
List of strings of the form x1,y1,x2,y2 where
(x1, y1) -> left-top and (x2, y2) -> right-bottom in PDFMiner's
coordinate space, denoting table areas to analyze.
(optional, default: None)
@ -135,6 +135,10 @@ class Lattice:
or both directions.
(optional, default: None)
headers : list
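List of strings, one per entry in table_area, where each string is a
comma-separated list of column headers for that table. The headers
are inserted as the first row of the extracted table data.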
(optional, default: None)
mtol : list
List of ints specifying m-tolerance parameters.
(optional, default: [2])
@ -170,13 +174,14 @@ class Lattice:
of detected contours, lines, joints and the table generated.
(optional, default: None)
"""
def __init__(self, table_area=None, fill=None, mtol=[2], scale=15,
invert=False, margins=(1.0, 0.5, 0.1), split_text=False,
shift_text=['l', 't'], debug=None):
def __init__(self, table_area=None, fill=None, headers=None, mtol=[2],
scale=15, invert=False, margins=(1.0, 0.5, 0.1),
split_text=False, shift_text=['l', 't'], debug=None):
self.method = 'lattice'
self.table_area = table_area
self.fill = fill
self.headers = [h.split(',') for h in headers]
self.mtol = mtol
self.scale = scale
self.invert = invert
@ -240,6 +245,10 @@ class Lattice:
if self.fill is not None:
if len(self.table_area) != len(self.fill):
raise ValueError("Length of fill should be equal to table_area.")
if self.headers is not None:
if len(self.table_area) != len(self.headers):
raise ValueError("Length of headers should be equal to table_area.")
areas = []
for area in self.table_area:
x1, y1, x2, y2 = area.split(",")
@ -297,6 +306,14 @@ class Lattice:
for i in range(0, len(cols) - 1)]
rows = [(rows[i], rows[i + 1])
for i in range(0, len(rows) - 1)]
if self.headers is not None and len(self.headers[table_no]) != len(cols):
logging.warning("Length of header ({0}) specified for table is not"
" equal to the number of columns ({1}) detected.".format(
len(self.headers[table_no]), len(cols)))
while len(self.headers[table_no]) != len(cols):
self.headers[table_no].append('')
rows, cols = rotate_table(rows, cols, table_rotation)
table = Table(cols, rows)
# set table edges to True using ver+hor lines
@ -326,6 +343,8 @@ class Lattice:
if self.fill is not None:
table = _fill_spanning(table, fill=self.fill[table_no])
ar = table.get_list()
if self.headers is not None and self.headers[table_no] != ['']:
ar.insert(0, self.headers[table_no])
ar = encode_list(ar)
table_data['data'] = ar
empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar)

View File

@ -224,7 +224,7 @@ class Stream:
Parameters
----------
table_area : list
List of tuples of the form (x1, y1, x2, y2) where
List of strings of the form x1,y1,x2,y2 where
(x1, y1) -> left-top and (x2, y2) -> right-bottom in PDFMiner's
coordinate space, denoting table areas to analyze.
(optional, default: None)
@ -238,6 +238,10 @@ class Stream:
List of ints specifying the number of columns in each table.
(optional, default: None)
headers : list
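List of strings, one per entry in table_area, where each string is a
comma-separated list of column headers for that table. The headers
are inserted as the first row of the extracted table data.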
(optional, default: None)
ytol : list
List of ints specifying the y-tolerance parameters.
(optional, default: [2])
@ -260,9 +264,9 @@ class Stream:
LTTextLineHorizontals in order to select table_area, columns.
(optional, default: False)
"""
def __init__(self, table_area=None, columns=None, ncolumns=None, ytol=[2],
mtol=[0], margins=(1.0, 0.5, 0.1), split_text=False,
debug=False):
def __init__(self, table_area=None, columns=None, ncolumns=None,
headers=None, ytol=[2], mtol=[0], margins=(1.0, 0.5, 0.1),
split_text=False, debug=False):
self.method = 'stream'
self.table_area = table_area
@ -270,6 +274,7 @@ class Stream:
self.ncolumns = ncolumns
self.ytol = ytol
self.mtol = mtol
self.headers = [h.split(',') for h in headers]
self.char_margin, self.line_margin, self.word_margin = margins
self.split_text = split_text
self.debug = debug
@ -310,6 +315,10 @@ class Stream:
if self.ncolumns is not None:
if len(self.table_area) != len(self.ncolumns):
raise ValueError("Length of ncolumns should be equal to table_area.")
if self.headers is not None:
if len(self.table_area) != len(self.headers):
raise ValueError("Length of headers should be equal to table_area.")
table_bbox = {}
for area in self.table_area:
x1, y1, x2, y2 = area.split(",")
@ -399,6 +408,13 @@ class Stream:
cols = _add_columns(cols, inner_text, self.ytol[table_no])
cols = _join_columns(cols, text_x_min, text_x_max)
if self.headers is not None and len(self.headers[table_no]) != len(cols):
logging.warning("Length of header ({0}) specified for table is not"
" equal to the number of columns ({1}) detected.".format(
len(self.headers[table_no]), len(cols)))
while len(self.headers[table_no]) != len(cols):
self.headers[table_no].append('')
table = Table(cols, rows)
table = table.set_all_edges()
assignment_errors = []
@ -416,6 +432,8 @@ class Stream:
table_data['score'] = score
ar = table.get_list()
if self.headers is not None and self.headers[table_no] != ['']:
ar.insert(0, self.headers[table_no])
ar = encode_list(ar)
table_data['data'] = ar
empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar)