Deprecate Stream ncolumns

pull/2/head
Vinayak Mehta 2016-11-07 21:30:48 +05:30 committed by GitHub
parent 72c2a0020f
commit 10eda3f204
1 changed files with 31 additions and 51 deletions

View File

@ -211,14 +211,14 @@ def _add_columns(cols, text, ytol):
class Stream: class Stream:
"""Stream looks for spaces between text elements to form a table. """Stream looks for spaces between text elements to form a table.
If you want to give columns, ncolumns, ytol or mtol for each table If you want to give columns, ytol or mtol for each table
when specifying multiple table areas, make sure that their length when specifying multiple table areas, make sure that their length
is equal to the length of table_area. Mapping between them is based is equal to the length of table_area. Mapping between them is based
on index. on index.
Also, if you want to specify columns for the first table and If you don't want to specify columns for some tables in a pdf
ncolumns for the second table in a pdf having two tables, pass page having multiple tables, pass them as empty strings.
columns as ['x1,x2,x3,x4', ''] and ncolumns as [-1, 5]. For example: ['', 'x1,x2,x3,x4', '']
Parameters Parameters
---------- ----------
@ -233,10 +233,6 @@ class Stream:
x-coordinates in PDFMiner's coordinate space. x-coordinates in PDFMiner's coordinate space.
(optional, default: None) (optional, default: None)
ncolumns : list
List of ints specifying the number of columns in each table.
(optional, default: None)
headers : list headers : list
List of strings where each string is a csv header for a table. List of strings where each string is a csv header for a table.
(optional, default: None) (optional, default: None)
@ -269,14 +265,13 @@ class Stream:
LTTextLineHorizontals in order to select table_area, columns. LTTextLineHorizontals in order to select table_area, columns.
(optional, default: False) (optional, default: False)
""" """
def __init__(self, table_area=None, columns=None, ncolumns=None, def __init__(self, table_area=None, columns=None, headers=None,
headers=None, ytol=[2], mtol=[0], margins=(1.0, 0.5, 0.1), ytol=[2], mtol=[0], margins=(1.0, 0.5, 0.1),
split_text=False, flag_size=True, debug=False): split_text=False, flag_size=True, debug=False):
self.method = 'stream' self.method = 'stream'
self.table_area = table_area self.table_area = table_area
self.columns = columns self.columns = columns
self.ncolumns = ncolumns
self.headers = headers self.headers = headers
self.ytol = ytol self.ytol = ytol
self.mtol = mtol self.mtol = mtol
@ -318,9 +313,6 @@ class Stream:
if self.columns is not None: if self.columns is not None:
if len(self.table_area) != len(self.columns): if len(self.table_area) != len(self.columns):
raise ValueError("Length of columns should be equal to table_area.") raise ValueError("Length of columns should be equal to table_area.")
if self.ncolumns is not None:
if len(self.table_area) != len(self.ncolumns):
raise ValueError("Length of ncolumns should be equal to table_area.")
if self.headers is not None: if self.headers is not None:
if len(self.table_area) != len(self.headers): if len(self.table_area) != len(self.headers):
raise ValueError("Length of headers should be equal to table_area.") raise ValueError("Length of headers should be equal to table_area.")
@ -371,18 +363,6 @@ class Stream:
cols.insert(0, text_x_min) cols.insert(0, text_x_min)
cols.append(text_x_max) cols.append(text_x_max)
cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)] cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
else:
if self.ncolumns is not None and self.ncolumns[table_no] != -1:
ncols = self.ncolumns[table_no]
cols = [(t.x0, t.x1)
for r in rows_grouped if len(r) == ncols for t in r]
cols = _merge_columns(sorted(cols), mtol=self.mtol[table_no])
if len(cols) != self.ncolumns[table_no]:
logging.warning("{}: The number of columns after merge"
" isn't the same as what you specified."
" Change the value of mtol.".format(
os.path.basename(bname)))
cols = _join_columns(cols, text_x_min, text_x_max)
else: else:
guess = True guess = True
ncols = max(set(elements), key=elements.count) ncols = max(set(elements), key=elements.count)