Deprecate Stream ncolumns

pull/2/head
Vinayak Mehta 2016-11-07 21:30:48 +05:30 committed by GitHub
parent 72c2a0020f
commit 10eda3f204
1 changed file with 31 additions and 51 deletions

View File

@ -211,14 +211,14 @@ def _add_columns(cols, text, ytol):
class Stream:
"""Stream looks for spaces between text elements to form a table.
If you want to give columns, ncolumns, ytol or mtol for each table
If you want to give columns, ytol or mtol for each table
when specifying multiple table areas, make sure that their length
is equal to the length of table_area. Mapping between them is based
on index.
Also, if you want to specify columns for the first table and
ncolumns for the second table in a pdf having two tables, pass
columns as ['x1,x2,x3,x4', ''] and ncolumns as [-1, 5].
If you don't want to specify columns for some tables in a pdf
page having multiple tables, pass them as empty strings.
For example: ['', 'x1,x2,x3,x4', '']
Parameters
----------
@ -233,10 +233,6 @@ class Stream:
x-coordinates in PDFMiner's coordinate space.
(optional, default: None)
ncolumns : list
List of ints specifying the number of columns in each table.
(optional, default: None)
headers : list
List of strings where each string is a csv header for a table.
(optional, default: None)
@ -269,14 +265,13 @@ class Stream:
LTTextLineHorizontals in order to select table_area, columns.
(optional, default: False)
"""
def __init__(self, table_area=None, columns=None, ncolumns=None,
headers=None, ytol=[2], mtol=[0], margins=(1.0, 0.5, 0.1),
def __init__(self, table_area=None, columns=None, headers=None,
ytol=[2], mtol=[0], margins=(1.0, 0.5, 0.1),
split_text=False, flag_size=True, debug=False):
self.method = 'stream'
self.table_area = table_area
self.columns = columns
self.ncolumns = ncolumns
self.headers = headers
self.ytol = ytol
self.mtol = mtol
@ -318,9 +313,6 @@ class Stream:
if self.columns is not None:
if len(self.table_area) != len(self.columns):
raise ValueError("Length of columns should be equal to table_area.")
if self.ncolumns is not None:
if len(self.table_area) != len(self.ncolumns):
raise ValueError("Length of ncolumns should be equal to table_area.")
if self.headers is not None:
if len(self.table_area) != len(self.headers):
raise ValueError("Length of headers should be equal to table_area.")
@ -371,18 +363,6 @@ class Stream:
cols.insert(0, text_x_min)
cols.append(text_x_max)
cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
else:
if self.ncolumns is not None and self.ncolumns[table_no] != -1:
ncols = self.ncolumns[table_no]
cols = [(t.x0, t.x1)
for r in rows_grouped if len(r) == ncols for t in r]
cols = _merge_columns(sorted(cols), mtol=self.mtol[table_no])
if len(cols) != self.ncolumns[table_no]:
logging.warning("{}: The number of columns after merge"
" isn't the same as what you specified."
" Change the value of mtol.".format(
os.path.basename(bname)))
cols = _join_columns(cols, text_x_min, text_x_max)
else:
guess = True
ncols = max(set(elements), key=elements.count)