Deprecate Stream ncolumns

pull/2/head
Vinayak Mehta 2016-11-07 21:30:48 +05:30 committed by GitHub
parent 72c2a0020f
commit 10eda3f204
1 changed files with 31 additions and 51 deletions

View File

@ -211,14 +211,14 @@ def _add_columns(cols, text, ytol):
class Stream: class Stream:
"""Stream looks for spaces between text elements to form a table. """Stream looks for spaces between text elements to form a table.
If you want to give columns, ncolumns, ytol or mtol for each table If you want to give columns, ytol or mtol for each table
when specifying multiple table areas, make sure that their length when specifying multiple table areas, make sure that their length
is equal to the length of table_area. Mapping between them is based is equal to the length of table_area. Mapping between them is based
on index. on index.
Also, if you want to specify columns for the first table and If you don't want to specify columns for some tables in a pdf
ncolumns for the second table in a pdf having two tables, pass page having multiple tables, pass them as empty strings.
columns as ['x1,x2,x3,x4', ''] and ncolumns as [-1, 5]. For example: ['', 'x1,x2,x3,x4', '']
Parameters Parameters
---------- ----------
@ -233,10 +233,6 @@ class Stream:
x-coordinates in PDFMiner's coordinate space. x-coordinates in PDFMiner's coordinate space.
(optional, default: None) (optional, default: None)
ncolumns : list
List of ints specifying the number of columns in each table.
(optional, default: None)
headers : list headers : list
List of strings where each string is a csv header for a table. List of strings where each string is a csv header for a table.
(optional, default: None) (optional, default: None)
@ -269,14 +265,13 @@ class Stream:
LTTextLineHorizontals in order to select table_area, columns. LTTextLineHorizontals in order to select table_area, columns.
(optional, default: False) (optional, default: False)
""" """
def __init__(self, table_area=None, columns=None, ncolumns=None, def __init__(self, table_area=None, columns=None, headers=None,
headers=None, ytol=[2], mtol=[0], margins=(1.0, 0.5, 0.1), ytol=[2], mtol=[0], margins=(1.0, 0.5, 0.1),
split_text=False, flag_size=True, debug=False): split_text=False, flag_size=True, debug=False):
self.method = 'stream' self.method = 'stream'
self.table_area = table_area self.table_area = table_area
self.columns = columns self.columns = columns
self.ncolumns = ncolumns
self.headers = headers self.headers = headers
self.ytol = ytol self.ytol = ytol
self.mtol = mtol self.mtol = mtol
@ -318,9 +313,6 @@ class Stream:
if self.columns is not None: if self.columns is not None:
if len(self.table_area) != len(self.columns): if len(self.table_area) != len(self.columns):
raise ValueError("Length of columns should be equal to table_area.") raise ValueError("Length of columns should be equal to table_area.")
if self.ncolumns is not None:
if len(self.table_area) != len(self.ncolumns):
raise ValueError("Length of ncolumns should be equal to table_area.")
if self.headers is not None: if self.headers is not None:
if len(self.table_area) != len(self.headers): if len(self.table_area) != len(self.headers):
raise ValueError("Length of headers should be equal to table_area.") raise ValueError("Length of headers should be equal to table_area.")
@ -372,43 +364,31 @@ class Stream:
cols.append(text_x_max) cols.append(text_x_max)
cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)] cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
else: else:
if self.ncolumns is not None and self.ncolumns[table_no] != -1: guess = True
ncols = self.ncolumns[table_no] ncols = max(set(elements), key=elements.count)
cols = [(t.x0, t.x1) len_non_mode = len(filter(lambda x: x != ncols, elements))
for r in rows_grouped if len(r) == ncols for t in r] if ncols == 1 and not self.debug:
cols = _merge_columns(sorted(cols), mtol=self.mtol[table_no]) # no tables detected
if len(cols) != self.ncolumns[table_no]: logging.warning("{}: Only one column was detected, the pdf"
logging.warning("{}: The number of columns after merge" " may have no tables. Specify ncols if"
" isn't the same as what you specified." " the pdf has tables.".format(
" Change the value of mtol.".format( os.path.basename(bname)))
os.path.basename(bname))) cols = [(t.x0, t.x1)
cols = _join_columns(cols, text_x_min, text_x_max) for r in rows_grouped if len(r) == ncols for t in r]
else: cols = _merge_columns(sorted(cols), mtol=self.mtol[table_no])
guess = True inner_text = []
ncols = max(set(elements), key=elements.count) for i in range(1, len(cols)):
len_non_mode = len(filter(lambda x: x != ncols, elements)) left = cols[i - 1][1]
if ncols == 1 and not self.debug: right = cols[i][0]
# no tables detected inner_text.extend([t for direction in t_bbox
logging.warning("{}: Only one column was detected, the pdf" for t in t_bbox[direction]
" may have no tables. Specify ncols if" if t.x0 > left and t.x1 < right])
" the pdf has tables.".format( outer_text = [t for direction in t_bbox
os.path.basename(bname))) for t in t_bbox[direction]
cols = [(t.x0, t.x1) if t.x0 > cols[-1][1] or t.x1 < cols[0][0]]
for r in rows_grouped if len(r) == ncols for t in r] inner_text.extend(outer_text)
cols = _merge_columns(sorted(cols), mtol=self.mtol[table_no]) cols = _add_columns(cols, inner_text, self.ytol[table_no])
inner_text = [] cols = _join_columns(cols, text_x_min, text_x_max)
for i in range(1, len(cols)):
left = cols[i - 1][1]
right = cols[i][0]
inner_text.extend([t for direction in t_bbox
for t in t_bbox[direction]
if t.x0 > left and t.x1 < right])
outer_text = [t for direction in t_bbox
for t in t_bbox[direction]
if t.x0 > cols[-1][1] or t.x1 < cols[0][0]]
inner_text.extend(outer_text)
cols = _add_columns(cols, inner_text, self.ytol[table_no])
cols = _join_columns(cols, text_x_min, text_x_max)
if self.headers is not None and self.headers[table_no] != [""]: if self.headers is not None and self.headers[table_no] != [""]:
self.headers[table_no] = self.headers[table_no].split(',') self.headers[table_no] = self.headers[table_no].split(',')