Add headers param

pull/2/head
Vinayak Mehta 2016-10-12 13:59:10 +05:30 committed by GitHub
parent a43d5ca2c7
commit e8b93a9624
2 changed files with 45 additions and 8 deletions

View File

@ -124,7 +124,7 @@ class Lattice:
Parameters Parameters
---------- ----------
table_area : list table_area : list
List of tuples of the form (x1, y1, x2, y2) where List of strings of the form x1,y1,x2,y2 where
(x1, y1) -> left-top and (x2, y2) -> right-bottom in PDFMiner's (x1, y1) -> left-top and (x2, y2) -> right-bottom in PDFMiner's
coordinate space, denoting table areas to analyze. coordinate space, denoting table areas to analyze.
(optional, default: None) (optional, default: None)
@ -135,6 +135,10 @@ class Lattice:
or both directions. or both directions.
(optional, default: None) (optional, default: None)
headers : list
List of strings, one per table area, where each string is a comma-separated list of column headers for that table.
(optional, default: None)
mtol : list mtol : list
List of ints specifying m-tolerance parameters. List of ints specifying m-tolerance parameters.
(optional, default: [2]) (optional, default: [2])
@ -170,13 +174,14 @@ class Lattice:
of detected contours, lines, joints and the table generated. of detected contours, lines, joints and the table generated.
(optional, default: None) (optional, default: None)
""" """
def __init__(self, table_area=None, fill=None, mtol=[2], scale=15, def __init__(self, table_area=None, fill=None, headers=None, mtol=[2],
invert=False, margins=(1.0, 0.5, 0.1), split_text=False, scale=15, invert=False, margins=(1.0, 0.5, 0.1),
shift_text=['l', 't'], debug=None): split_text=False, shift_text=['l', 't'], debug=None):
self.method = 'lattice' self.method = 'lattice'
self.table_area = table_area self.table_area = table_area
self.fill = fill self.fill = fill
self.headers = [h.split(',') for h in headers]
self.mtol = mtol self.mtol = mtol
self.scale = scale self.scale = scale
self.invert = invert self.invert = invert
@ -240,6 +245,10 @@ class Lattice:
if self.fill is not None: if self.fill is not None:
if len(self.table_area) != len(self.fill): if len(self.table_area) != len(self.fill):
raise ValueError("Length of fill should be equal to table_area.") raise ValueError("Length of fill should be equal to table_area.")
if self.headers is not None:
if len(self.table_area) != len(self.headers):
raise ValueError("Length of headers should be equal to table_area.")
areas = [] areas = []
for area in self.table_area: for area in self.table_area:
x1, y1, x2, y2 = area.split(",") x1, y1, x2, y2 = area.split(",")
@ -297,6 +306,14 @@ class Lattice:
for i in range(0, len(cols) - 1)] for i in range(0, len(cols) - 1)]
rows = [(rows[i], rows[i + 1]) rows = [(rows[i], rows[i + 1])
for i in range(0, len(rows) - 1)] for i in range(0, len(rows) - 1)]
if self.headers is not None and len(self.headers[table_no]) != len(cols):
logging.warning("Length of header ({0}) specified for table is not"
" equal to the number of columns ({1}) detected.".format(
len(self.headers[table_no]), len(cols)))
while len(self.headers[table_no]) != len(cols):
self.headers[table_no].append('')
rows, cols = rotate_table(rows, cols, table_rotation) rows, cols = rotate_table(rows, cols, table_rotation)
table = Table(cols, rows) table = Table(cols, rows)
# set table edges to True using ver+hor lines # set table edges to True using ver+hor lines
@ -326,6 +343,8 @@ class Lattice:
if self.fill is not None: if self.fill is not None:
table = _fill_spanning(table, fill=self.fill[table_no]) table = _fill_spanning(table, fill=self.fill[table_no])
ar = table.get_list() ar = table.get_list()
if self.headers is not None and self.headers[table_no] != ['']:
ar.insert(0, self.headers[table_no])
ar = encode_list(ar) ar = encode_list(ar)
table_data['data'] = ar table_data['data'] = ar
empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar) empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar)

View File

@ -224,7 +224,7 @@ class Stream:
Parameters Parameters
---------- ----------
table_area : list table_area : list
List of tuples of the form (x1, y1, x2, y2) where List of strings of the form x1,y1,x2,y2 where
(x1, y1) -> left-top and (x2, y2) -> right-bottom in PDFMiner's (x1, y1) -> left-top and (x2, y2) -> right-bottom in PDFMiner's
coordinate space, denoting table areas to analyze. coordinate space, denoting table areas to analyze.
(optional, default: None) (optional, default: None)
@ -238,6 +238,10 @@ class Stream:
List of ints specifying the number of columns in each table. List of ints specifying the number of columns in each table.
(optional, default: None) (optional, default: None)
headers : list
List of strings, one per table area, where each string is a comma-separated list of column headers for that table.
(optional, default: None)
ytol : list ytol : list
List of ints specifying the y-tolerance parameters. List of ints specifying the y-tolerance parameters.
(optional, default: [2]) (optional, default: [2])
@ -260,9 +264,9 @@ class Stream:
LTTextLineHorizontals in order to select table_area, columns. LTTextLineHorizontals in order to select table_area, columns.
(optional, default: False) (optional, default: False)
""" """
def __init__(self, table_area=None, columns=None, ncolumns=None, ytol=[2], def __init__(self, table_area=None, columns=None, ncolumns=None,
mtol=[0], margins=(1.0, 0.5, 0.1), split_text=False, headers=None, ytol=[2], mtol=[0], margins=(1.0, 0.5, 0.1),
debug=False): split_text=False, debug=False):
self.method = 'stream' self.method = 'stream'
self.table_area = table_area self.table_area = table_area
@ -270,6 +274,7 @@ class Stream:
self.ncolumns = ncolumns self.ncolumns = ncolumns
self.ytol = ytol self.ytol = ytol
self.mtol = mtol self.mtol = mtol
self.headers = [h.split(',') for h in headers]
self.char_margin, self.line_margin, self.word_margin = margins self.char_margin, self.line_margin, self.word_margin = margins
self.split_text = split_text self.split_text = split_text
self.debug = debug self.debug = debug
@ -310,6 +315,10 @@ class Stream:
if self.ncolumns is not None: if self.ncolumns is not None:
if len(self.table_area) != len(self.ncolumns): if len(self.table_area) != len(self.ncolumns):
raise ValueError("Length of ncolumns should be equal to table_area.") raise ValueError("Length of ncolumns should be equal to table_area.")
if self.headers is not None:
if len(self.table_area) != len(self.headers):
raise ValueError("Length of headers should be equal to table_area.")
table_bbox = {} table_bbox = {}
for area in self.table_area: for area in self.table_area:
x1, y1, x2, y2 = area.split(",") x1, y1, x2, y2 = area.split(",")
@ -399,6 +408,13 @@ class Stream:
cols = _add_columns(cols, inner_text, self.ytol[table_no]) cols = _add_columns(cols, inner_text, self.ytol[table_no])
cols = _join_columns(cols, text_x_min, text_x_max) cols = _join_columns(cols, text_x_min, text_x_max)
if self.headers is not None and len(self.headers[table_no]) != len(cols):
logging.warning("Length of header ({0}) specified for table is not"
" equal to the number of columns ({1}) detected.".format(
len(self.headers[table_no]), len(cols)))
while len(self.headers[table_no]) != len(cols):
self.headers[table_no].append('')
table = Table(cols, rows) table = Table(cols, rows)
table = table.set_all_edges() table = table.set_all_edges()
assignment_errors = [] assignment_errors = []
@ -416,6 +432,8 @@ class Stream:
table_data['score'] = score table_data['score'] = score
ar = table.get_list() ar = table.get_list()
if self.headers is not None and self.headers[table_no] != ['']:
ar.insert(0, self.headers[table_no])
ar = encode_list(ar) ar = encode_list(ar)
table_data['data'] = ar table_data['data'] = ar
empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar) empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar)