Merge ba1604ee40 into 705473198f
commit
dcae630351
|
|
@ -237,6 +237,13 @@ def lattice(c, *args, **kwargs):
|
|||
multiple=True,
|
||||
help="X coordinates of column separators.",
|
||||
)
|
||||
@click.option(
|
||||
"-R",
|
||||
"--rows",
|
||||
default=[],
|
||||
multiple=True,
|
||||
help="Y coordinates of rows separators.",
|
||||
)
|
||||
@click.option(
|
||||
"-e",
|
||||
"--edge_tol",
|
||||
|
|
@ -282,6 +289,8 @@ def stream(c, *args, **kwargs):
|
|||
kwargs["table_areas"] = None if not table_areas else table_areas
|
||||
columns = list(kwargs["columns"])
|
||||
kwargs["columns"] = None if not columns else columns
|
||||
rows = list(kwargs["rows"])
|
||||
kwargs["rows"] = None if not rows else rows
|
||||
|
||||
if plot_type is not None:
|
||||
if not _HAS_MPL:
|
||||
|
|
|
|||
|
|
@ -43,6 +43,9 @@ def read_pdf(
|
|||
columns^ : list, optional (default: None)
|
||||
List of column x-coordinate strings where the coordinates
|
||||
are comma-separated.
|
||||
rows^ : list, optional (default: None)
|
||||
List of row y-coordinate strings where the coordinates
|
||||
are comma-separated.
|
||||
split_text : bool, optional (default: False)
|
||||
Split text that spans across multiple cells.
|
||||
flag_size : bool, optional (default: False)
|
||||
|
|
|
|||
|
|
@ -19,7 +19,7 @@ class Stream(BaseParser):
|
|||
"""Stream method of parsing looks for spaces between text
|
||||
to parse the table.
|
||||
|
||||
If you want to specify columns when specifying multiple table
|
||||
If you want to specify rows or columns when specifying multiple table
|
||||
areas, make sure that the lengths of both lists are equal.
|
||||
|
||||
Parameters
|
||||
|
|
@ -51,6 +51,9 @@ class Stream(BaseParser):
|
|||
column_tol : int, optional (default: 0)
|
||||
Tolerance parameter used to combine text horizontally,
|
||||
to generate columns.
|
||||
rows : list, optional (default: None)
|
||||
List of row y-coordinate strings where the coordinates
|
||||
are comma-separated.
|
||||
|
||||
"""
|
||||
|
||||
|
|
@ -65,6 +68,7 @@ class Stream(BaseParser):
|
|||
edge_tol=50,
|
||||
row_tol=2,
|
||||
column_tol=0,
|
||||
rows=None,
|
||||
**kwargs
|
||||
):
|
||||
self.table_regions = table_regions
|
||||
|
|
@ -77,6 +81,7 @@ class Stream(BaseParser):
|
|||
self.edge_tol = edge_tol
|
||||
self.row_tol = row_tol
|
||||
self.column_tol = column_tol
|
||||
self.rows = rows
|
||||
|
||||
@staticmethod
|
||||
def _text_bbox(t_bbox):
|
||||
|
|
@ -283,7 +288,7 @@ class Stream(BaseParser):
|
|||
# guess table areas using textlines and relevant edges
|
||||
table_bbox = textedges.get_table_areas(textlines, relevant_textedges)
|
||||
# treat whole page as table area if no table areas found
|
||||
if not len(table_bbox):
|
||||
if not table_bbox:
|
||||
table_bbox = {(0, 0, self.pdf_width, self.pdf_height): None}
|
||||
|
||||
return table_bbox
|
||||
|
|
@ -329,6 +334,13 @@ class Stream(BaseParser):
|
|||
|
||||
text_x_min, text_y_min, text_x_max, text_y_max = self._text_bbox(self.t_bbox)
|
||||
rows_grouped = self._group_rows(self.t_bbox["horizontal"], row_tol=self.row_tol)
|
||||
if self.rows is not None and self.rows[table_idx] != "":
|
||||
rows = self.rows[table_idx].split(",")
|
||||
rows = [float(c) for c in rows]
|
||||
rows.insert(0, text_y_max)
|
||||
rows.append(text_y_min)
|
||||
rows = [(rows[i], rows[i+1]) for i in range(0, len(rows) - 1)]
|
||||
else:
|
||||
rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
|
||||
elements = [len(r) for r in rows_grouped]
|
||||
|
||||
|
|
@ -353,7 +365,7 @@ class Stream(BaseParser):
|
|||
# see if the list contains elements, if yes, then use
|
||||
# the mode after removing 1s
|
||||
elements = list(filter(lambda x: x != 1, elements))
|
||||
if len(elements):
|
||||
if elements:
|
||||
ncols = max(set(elements), key=elements.count)
|
||||
else:
|
||||
warnings.warn(
|
||||
|
|
|
|||
|
|
@ -93,7 +93,7 @@ def download_url(url):
|
|||
return filepath
|
||||
|
||||
|
||||
stream_kwargs = ["columns", "edge_tol", "row_tol", "column_tol"]
|
||||
stream_kwargs = ["columns", "edge_tol", "row_tol", "column_tol", "rows"]
|
||||
lattice_kwargs = [
|
||||
"process_background",
|
||||
"line_scale",
|
||||
|
|
|
|||
Loading…
Reference in New Issue