Add headers param
parent
a43d5ca2c7
commit
e8b93a9624
|
|
@ -124,7 +124,7 @@ class Lattice:
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
table_area : list
|
table_area : list
|
||||||
List of tuples of the form (x1, y1, x2, y2) where
|
List of strings of the form x1,y1,x2,y2 where
|
||||||
(x1, y1) -> left-top and (x2, y2) -> right-bottom in PDFMiner's
|
(x1, y1) -> left-top and (x2, y2) -> right-bottom in PDFMiner's
|
||||||
coordinate space, denoting table areas to analyze.
|
coordinate space, denoting table areas to analyze.
|
||||||
(optional, default: None)
|
(optional, default: None)
|
||||||
|
|
@ -135,6 +135,10 @@ class Lattice:
|
||||||
or both directions.
|
or both directions.
|
||||||
(optional, default: None)
|
(optional, default: None)
|
||||||
|
|
||||||
|
headers : list
|
||||||
|
List of strings, one per table area, each a comma-separated list of column names inserted as that table's first row.
|
||||||
|
(optional, default: None)
|
||||||
|
|
||||||
mtol : list
|
mtol : list
|
||||||
List of ints specifying m-tolerance parameters.
|
List of ints specifying m-tolerance parameters.
|
||||||
(optional, default: [2])
|
(optional, default: [2])
|
||||||
|
|
@ -170,13 +174,14 @@ class Lattice:
|
||||||
of detected contours, lines, joints and the table generated.
|
of detected contours, lines, joints and the table generated.
|
||||||
(optional, default: None)
|
(optional, default: None)
|
||||||
"""
|
"""
|
||||||
def __init__(self, table_area=None, fill=None, mtol=[2], scale=15,
|
def __init__(self, table_area=None, fill=None, headers=None, mtol=[2],
|
||||||
invert=False, margins=(1.0, 0.5, 0.1), split_text=False,
|
scale=15, invert=False, margins=(1.0, 0.5, 0.1),
|
||||||
shift_text=['l', 't'], debug=None):
|
split_text=False, shift_text=['l', 't'], debug=None):
|
||||||
|
|
||||||
self.method = 'lattice'
|
self.method = 'lattice'
|
||||||
self.table_area = table_area
|
self.table_area = table_area
|
||||||
self.fill = fill
|
self.fill = fill
|
||||||
|
self.headers = None if headers is None else [h.split(',') for h in headers]  # headers defaults to None; iterating it unguarded raises TypeError
|
||||||
self.mtol = mtol
|
self.mtol = mtol
|
||||||
self.scale = scale
|
self.scale = scale
|
||||||
self.invert = invert
|
self.invert = invert
|
||||||
|
|
@ -240,6 +245,10 @@ class Lattice:
|
||||||
if self.fill is not None:
|
if self.fill is not None:
|
||||||
if len(self.table_area) != len(self.fill):
|
if len(self.table_area) != len(self.fill):
|
||||||
raise ValueError("Length of fill should be equal to table_area.")
|
raise ValueError("Length of fill should be equal to table_area.")
|
||||||
|
if self.headers is not None:
|
||||||
|
if len(self.table_area) != len(self.headers):
|
||||||
|
raise ValueError("Length of headers should be equal to table_area.")
|
||||||
|
|
||||||
areas = []
|
areas = []
|
||||||
for area in self.table_area:
|
for area in self.table_area:
|
||||||
x1, y1, x2, y2 = area.split(",")
|
x1, y1, x2, y2 = area.split(",")
|
||||||
|
|
@ -297,6 +306,14 @@ class Lattice:
|
||||||
for i in range(0, len(cols) - 1)]
|
for i in range(0, len(cols) - 1)]
|
||||||
rows = [(rows[i], rows[i + 1])
|
rows = [(rows[i], rows[i + 1])
|
||||||
for i in range(0, len(rows) - 1)]
|
for i in range(0, len(rows) - 1)]
|
||||||
|
|
||||||
|
if self.headers is not None and len(self.headers[table_no]) != len(cols):
|
||||||
|
logging.warning("Length of header ({0}) specified for table is not"
|
||||||
|
" equal to the number of columns ({1}) detected.".format(
|
||||||
|
len(self.headers[table_no]), len(cols)))
|
||||||
|
while len(self.headers[table_no]) != len(cols):
|
||||||
|
self.headers[table_no].append('')
|
||||||
|
|
||||||
rows, cols = rotate_table(rows, cols, table_rotation)
|
rows, cols = rotate_table(rows, cols, table_rotation)
|
||||||
table = Table(cols, rows)
|
table = Table(cols, rows)
|
||||||
# set table edges to True using ver+hor lines
|
# set table edges to True using ver+hor lines
|
||||||
|
|
@ -326,6 +343,8 @@ class Lattice:
|
||||||
if self.fill is not None:
|
if self.fill is not None:
|
||||||
table = _fill_spanning(table, fill=self.fill[table_no])
|
table = _fill_spanning(table, fill=self.fill[table_no])
|
||||||
ar = table.get_list()
|
ar = table.get_list()
|
||||||
|
if self.headers is not None and self.headers[table_no] != ['']:
|
||||||
|
ar.insert(0, self.headers[table_no])
|
||||||
ar = encode_list(ar)
|
ar = encode_list(ar)
|
||||||
table_data['data'] = ar
|
table_data['data'] = ar
|
||||||
empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar)
|
empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar)
|
||||||
|
|
|
||||||
|
|
@ -224,7 +224,7 @@ class Stream:
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
table_area : list
|
table_area : list
|
||||||
List of tuples of the form (x1, y1, x2, y2) where
|
List of strings of the form x1,y1,x2,y2 where
|
||||||
(x1, y1) -> left-top and (x2, y2) -> right-bottom in PDFMiner's
|
(x1, y1) -> left-top and (x2, y2) -> right-bottom in PDFMiner's
|
||||||
coordinate space, denoting table areas to analyze.
|
coordinate space, denoting table areas to analyze.
|
||||||
(optional, default: None)
|
(optional, default: None)
|
||||||
|
|
@ -238,6 +238,10 @@ class Stream:
|
||||||
List of ints specifying the number of columns in each table.
|
List of ints specifying the number of columns in each table.
|
||||||
(optional, default: None)
|
(optional, default: None)
|
||||||
|
|
||||||
|
headers : list
|
||||||
|
List of strings, one per table area, each a comma-separated list of column names inserted as that table's first row.
|
||||||
|
(optional, default: None)
|
||||||
|
|
||||||
ytol : list
|
ytol : list
|
||||||
List of ints specifying the y-tolerance parameters.
|
List of ints specifying the y-tolerance parameters.
|
||||||
(optional, default: [2])
|
(optional, default: [2])
|
||||||
|
|
@ -260,9 +264,9 @@ class Stream:
|
||||||
LTTextLineHorizontals in order to select table_area, columns.
|
LTTextLineHorizontals in order to select table_area, columns.
|
||||||
(optional, default: False)
|
(optional, default: False)
|
||||||
"""
|
"""
|
||||||
def __init__(self, table_area=None, columns=None, ncolumns=None, ytol=[2],
|
def __init__(self, table_area=None, columns=None, ncolumns=None,
|
||||||
mtol=[0], margins=(1.0, 0.5, 0.1), split_text=False,
|
headers=None, ytol=[2], mtol=[0], margins=(1.0, 0.5, 0.1),
|
||||||
debug=False):
|
split_text=False, debug=False):
|
||||||
|
|
||||||
self.method = 'stream'
|
self.method = 'stream'
|
||||||
self.table_area = table_area
|
self.table_area = table_area
|
||||||
|
|
@ -270,6 +274,7 @@ class Stream:
|
||||||
self.ncolumns = ncolumns
|
self.ncolumns = ncolumns
|
||||||
self.ytol = ytol
|
self.ytol = ytol
|
||||||
self.mtol = mtol
|
self.mtol = mtol
|
||||||
|
self.headers = None if headers is None else [h.split(',') for h in headers]  # headers defaults to None; iterating it unguarded raises TypeError
|
||||||
self.char_margin, self.line_margin, self.word_margin = margins
|
self.char_margin, self.line_margin, self.word_margin = margins
|
||||||
self.split_text = split_text
|
self.split_text = split_text
|
||||||
self.debug = debug
|
self.debug = debug
|
||||||
|
|
@ -310,6 +315,10 @@ class Stream:
|
||||||
if self.ncolumns is not None:
|
if self.ncolumns is not None:
|
||||||
if len(self.table_area) != len(self.ncolumns):
|
if len(self.table_area) != len(self.ncolumns):
|
||||||
raise ValueError("Length of ncolumns should be equal to table_area.")
|
raise ValueError("Length of ncolumns should be equal to table_area.")
|
||||||
|
if self.headers is not None:
|
||||||
|
if len(self.table_area) != len(self.headers):
|
||||||
|
raise ValueError("Length of headers should be equal to table_area.")
|
||||||
|
|
||||||
table_bbox = {}
|
table_bbox = {}
|
||||||
for area in self.table_area:
|
for area in self.table_area:
|
||||||
x1, y1, x2, y2 = area.split(",")
|
x1, y1, x2, y2 = area.split(",")
|
||||||
|
|
@ -399,6 +408,13 @@ class Stream:
|
||||||
cols = _add_columns(cols, inner_text, self.ytol[table_no])
|
cols = _add_columns(cols, inner_text, self.ytol[table_no])
|
||||||
cols = _join_columns(cols, text_x_min, text_x_max)
|
cols = _join_columns(cols, text_x_min, text_x_max)
|
||||||
|
|
||||||
|
if self.headers is not None and len(self.headers[table_no]) != len(cols):
|
||||||
|
logging.warning("Length of header ({0}) specified for table is not"
|
||||||
|
" equal to the number of columns ({1}) detected.".format(
|
||||||
|
len(self.headers[table_no]), len(cols)))
|
||||||
|
while len(self.headers[table_no]) != len(cols):
|
||||||
|
self.headers[table_no].append('')
|
||||||
|
|
||||||
table = Table(cols, rows)
|
table = Table(cols, rows)
|
||||||
table = table.set_all_edges()
|
table = table.set_all_edges()
|
||||||
assignment_errors = []
|
assignment_errors = []
|
||||||
|
|
@ -416,6 +432,8 @@ class Stream:
|
||||||
|
|
||||||
table_data['score'] = score
|
table_data['score'] = score
|
||||||
ar = table.get_list()
|
ar = table.get_list()
|
||||||
|
if self.headers is not None and self.headers[table_no] != ['']:
|
||||||
|
ar.insert(0, self.headers[table_no])
|
||||||
ar = encode_list(ar)
|
ar = encode_list(ar)
|
||||||
table_data['data'] = ar
|
table_data['data'] = ar
|
||||||
empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar)
|
empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar)
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue