Fix docstrings and interlinks
parent
3713c08642
commit
17ea5f335e
|
|
@ -46,7 +46,6 @@ class Cell(object):
|
||||||
Whether or not cell spans vertically.
|
Whether or not cell spans vertically.
|
||||||
text : string
|
text : string
|
||||||
Text assigned to cell.
|
Text assigned to cell.
|
||||||
bound
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
@ -101,8 +100,7 @@ class Table(object):
|
||||||
|
|
||||||
Attributes
|
Attributes
|
||||||
----------
|
----------
|
||||||
df : object
|
df : :class:`pandas.DataFrame`
|
||||||
pandas.DataFrame
|
|
||||||
shape : tuple
|
shape : tuple
|
||||||
Shape of the table.
|
Shape of the table.
|
||||||
accuracy : float
|
accuracy : float
|
||||||
|
|
@ -113,8 +111,6 @@ class Table(object):
|
||||||
Table number on pdf page.
|
Table number on pdf page.
|
||||||
page : int
|
page : int
|
||||||
Pdf page number.
|
Pdf page number.
|
||||||
data
|
|
||||||
parsing_report
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
def __init__(self, cols, rows):
|
def __init__(self, cols, rows):
|
||||||
|
|
@ -143,7 +139,7 @@ class Table(object):
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def parsing_report(self):
|
def parsing_report(self):
|
||||||
"""Returns a parsing report with accuracy, %whitespace,
|
"""Returns a parsing report with %accuracy, %whitespace,
|
||||||
table number on page and page number.
|
table number on page and page number.
|
||||||
"""
|
"""
|
||||||
# pretty?
|
# pretty?
|
||||||
|
|
@ -320,10 +316,15 @@ class Table(object):
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def to_csv(self, path, **kwargs):
|
def to_csv(self, path, **kwargs):
|
||||||
"""Write Table to a comma-separated values (csv) file.
|
"""Writes Table to a comma-separated values (csv) file.
|
||||||
|
|
||||||
|
For kwargs, check :meth:`pandas.DataFrame.to_csv`.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
path : str
|
||||||
|
Output filepath.
|
||||||
|
|
||||||
Check `pandas.DataFrame.to_csv <https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_csv.html>`_
|
|
||||||
kwargs for more details around what kwargs to use.
|
|
||||||
"""
|
"""
|
||||||
kw = {
|
kw = {
|
||||||
'encoding': 'utf-8',
|
'encoding': 'utf-8',
|
||||||
|
|
@ -334,10 +335,15 @@ class Table(object):
|
||||||
self.df.to_csv(path, **kw)
|
self.df.to_csv(path, **kw)
|
||||||
|
|
||||||
def to_json(self, path, **kwargs):
|
def to_json(self, path, **kwargs):
|
||||||
"""Write Table to a JSON file.
|
"""Writes Table to a JSON file.
|
||||||
|
|
||||||
|
For kwargs, check :meth:`pandas.DataFrame.to_json`.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
path : str
|
||||||
|
Output filepath.
|
||||||
|
|
||||||
Check `pandas.DataFrame.to_json <https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_json.html>`_
|
|
||||||
kwargs for more details around what kwargs to use.
|
|
||||||
"""
|
"""
|
||||||
kw = {
|
kw = {
|
||||||
'orient': 'records'
|
'orient': 'records'
|
||||||
|
|
@ -348,10 +354,15 @@ class Table(object):
|
||||||
f.write(json_string)
|
f.write(json_string)
|
||||||
|
|
||||||
def to_excel(self, path, **kwargs):
|
def to_excel(self, path, **kwargs):
|
||||||
"""Write Table to an Excel file.
|
"""Writes Table to an Excel file.
|
||||||
|
|
||||||
|
For kwargs, check :meth:`pandas.DataFrame.to_excel`.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
path : str
|
||||||
|
Output filepath.
|
||||||
|
|
||||||
Check `pandas.DataFrame.to_excel <https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_excel.html>`_
|
|
||||||
kwargs for more details around what kwargs to use.
|
|
||||||
"""
|
"""
|
||||||
kw = {
|
kw = {
|
||||||
'sheet_name': 'page-{}-table-{}'.format(self.page, self.order),
|
'sheet_name': 'page-{}-table-{}'.format(self.page, self.order),
|
||||||
|
|
@ -363,10 +374,15 @@ class Table(object):
|
||||||
writer.save()
|
writer.save()
|
||||||
|
|
||||||
def to_html(self, path, **kwargs):
|
def to_html(self, path, **kwargs):
|
||||||
"""Write Table to an HTML file.
|
"""Writes Table to an HTML file.
|
||||||
|
|
||||||
|
For kwargs, check :meth:`pandas.DataFrame.to_html`.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
path : str
|
||||||
|
Output filepath.
|
||||||
|
|
||||||
Check `pandas.DataFrame.to_html <https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_html.html>`_
|
|
||||||
kwargs for more details around what kwargs to use.
|
|
||||||
"""
|
"""
|
||||||
html_string = self.df.to_html(**kwargs)
|
html_string = self.df.to_html(**kwargs)
|
||||||
with open(path, 'w') as f:
|
with open(path, 'w') as f:
|
||||||
|
|
@ -434,7 +450,7 @@ class TableList(object):
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
path : str
|
path : str
|
||||||
Filepath
|
Output filepath.
|
||||||
f : str
|
f : str
|
||||||
File format. Can be csv, json, excel or html.
|
File format. Can be csv, json, excel or html.
|
||||||
compress : bool
|
compress : bool
|
||||||
|
|
|
||||||
|
|
@ -13,8 +13,8 @@ class PDFHandler(object):
|
||||||
file into single page pdfs, parsing each pdf and then removing the
|
file into single page pdfs, parsing each pdf and then removing the
|
||||||
temp directory.
|
temp directory.
|
||||||
|
|
||||||
Parameter
|
Parameters
|
||||||
---------
|
----------
|
||||||
filename : str
|
filename : str
|
||||||
Path to pdf file.
|
Path to pdf file.
|
||||||
pages : str
|
pages : str
|
||||||
|
|
|
||||||
|
|
@ -30,8 +30,8 @@ def read_pdf(filepath, pages='1', mesh=False, **kwargs):
|
||||||
multiple cells.
|
multiple cells.
|
||||||
flag_size : bool, optional (default: False)
|
flag_size : bool, optional (default: False)
|
||||||
Whether or not to highlight a substring using <s></s>
|
Whether or not to highlight a substring using <s></s>
|
||||||
if its size is different from rest of the string, useful for
|
if its size is different from rest of the string. (Useful for
|
||||||
super and subscripts.
|
super and subscripts)
|
||||||
row_close_tol^ : int, optional (default: 2)
|
row_close_tol^ : int, optional (default: 2)
|
||||||
Rows will be formed by combining text vertically
|
Rows will be formed by combining text vertically
|
||||||
within this tolerance.
|
within this tolerance.
|
||||||
|
|
@ -61,24 +61,24 @@ def read_pdf(filepath, pages='1', mesh=False, **kwargs):
|
||||||
joint_close_tol* : int, optional (default: 2)
|
joint_close_tol* : int, optional (default: 2)
|
||||||
Tolerance parameter used to decide whether the detected lines
|
Tolerance parameter used to decide whether the detected lines
|
||||||
and points lie close to each other.
|
and points lie close to each other.
|
||||||
threshold_blocksize : int, optional (default: 15)
|
threshold_blocksize* : int, optional (default: 15)
|
||||||
Size of a pixel neighborhood that is used to calculate a
|
Size of a pixel neighborhood that is used to calculate a
|
||||||
threshold value for the pixel: 3, 5, 7, and so on.
|
threshold value for the pixel: 3, 5, 7, and so on.
|
||||||
|
|
||||||
For more information, refer to `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
|
For more information, refer to `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
|
||||||
threshold_constant : int, optional (default: -2)
|
threshold_constant* : int, optional (default: -2)
|
||||||
Constant subtracted from the mean or weighted mean.
|
Constant subtracted from the mean or weighted mean.
|
||||||
Normally, it is positive but may be zero or negative as well.
|
Normally, it is positive but may be zero or negative as well.
|
||||||
|
|
||||||
For more information, refer to `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
|
For more information, refer to `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
|
||||||
iterations : int, optional (default: 0)
|
iterations* : int, optional (default: 0)
|
||||||
Number of times erosion/dilation is applied.
|
Number of times erosion/dilation is applied.
|
||||||
|
|
||||||
For more information, refer to `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
|
For more information, refer to `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
|
||||||
margins : tuple
|
margins : tuple
|
||||||
PDFMiner margins. (char_margin, line_margin, word_margin)
|
PDFMiner margins. (char_margin, line_margin, word_margin)
|
||||||
|
|
||||||
For for information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
|
For more information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
|
|
|
||||||
|
|
@ -51,8 +51,8 @@ class Lattice(BaseParser):
|
||||||
multiple cells.
|
multiple cells.
|
||||||
flag_size : bool, optional (default: False)
|
flag_size : bool, optional (default: False)
|
||||||
Whether or not to highlight a substring using <s></s>
|
Whether or not to highlight a substring using <s></s>
|
||||||
if its size is different from rest of the string, useful for
|
if its size is different from rest of the string. (Useful for
|
||||||
super and subscripts.
|
super and subscripts)
|
||||||
line_close_tol : int, optional (default: 2)
|
line_close_tol : int, optional (default: 2)
|
||||||
Tolerance parameter used to merge vertical and horizontal
|
Tolerance parameter used to merge vertical and horizontal
|
||||||
detected lines which lie close to each other.
|
detected lines which lie close to each other.
|
||||||
|
|
@ -76,7 +76,7 @@ class Lattice(BaseParser):
|
||||||
margins : tuple
|
margins : tuple
|
||||||
PDFMiner margins. (char_margin, line_margin, word_margin)
|
PDFMiner margins. (char_margin, line_margin, word_margin)
|
||||||
|
|
||||||
For for information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
|
For more information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
|
||||||
debug : bool, optional (default: False)
|
debug : bool, optional (default: False)
|
||||||
Whether or not to return all text objects on the page
|
Whether or not to return all text objects on the page
|
||||||
which can be used to generate a matplotlib plot, to get
|
which can be used to generate a matplotlib plot, to get
|
||||||
|
|
|
||||||
|
|
@ -35,8 +35,8 @@ class Stream(BaseParser):
|
||||||
multiple cells.
|
multiple cells.
|
||||||
flag_size : bool, optional (default: False)
|
flag_size : bool, optional (default: False)
|
||||||
Whether or not to highlight a substring using <s></s>
|
Whether or not to highlight a substring using <s></s>
|
||||||
if its size is different from rest of the string, useful for
|
if its size is different from rest of the string. (Useful for
|
||||||
super and subscripts.
|
super and subscripts)
|
||||||
row_close_tol : int, optional (default: 2)
|
row_close_tol : int, optional (default: 2)
|
||||||
Rows will be formed by combining text vertically
|
Rows will be formed by combining text vertically
|
||||||
within this tolerance.
|
within this tolerance.
|
||||||
|
|
@ -46,7 +46,7 @@ class Stream(BaseParser):
|
||||||
margins : tuple, optional (default: (1.0, 0.5, 0.1))
|
margins : tuple, optional (default: (1.0, 0.5, 0.1))
|
||||||
PDFMiner margins. (char_margin, line_margin, word_margin)
|
PDFMiner margins. (char_margin, line_margin, word_margin)
|
||||||
|
|
||||||
For for information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
|
For more information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
|
||||||
debug : bool, optional (default: False)
|
debug : bool, optional (default: False)
|
||||||
Whether or not to return all text objects on the page
|
Whether or not to return all text objects on the page
|
||||||
which can be used to generate a matplotlib plot, to get
|
which can be used to generate a matplotlib plot, to get
|
||||||
|
|
|
||||||
|
|
@ -25,12 +25,12 @@ def plot_geometry(filepath, pages='1', mesh=False, geometry_type=None, **kwargs)
|
||||||
Whether or not to use Lattice method of parsing. Stream
|
Whether or not to use Lattice method of parsing. Stream
|
||||||
is used by default.
|
is used by default.
|
||||||
geometry_type : str, optional (default: None)
|
geometry_type : str, optional (default: None)
|
||||||
'text' : Plot text objects found on page, useful to get
|
* 'text' : Plot text objects found on page. (Useful to get \
|
||||||
table_area and columns coordinates.
|
table_area and columns coordinates)
|
||||||
'table' : Plot parsed table.
|
* 'table' : Plot parsed table.
|
||||||
'contour'* : Plot detected rectangles.
|
* 'contour'* : Plot detected rectangles.
|
||||||
'joint'* : Plot detected line intersections.
|
* 'joint'* : Plot detected line intersections.
|
||||||
'line'* : Plot detected lines.
|
* 'line'* : Plot detected lines.
|
||||||
table_area : list, optional (default: None)
|
table_area : list, optional (default: None)
|
||||||
List of table areas to process as strings of the form
|
List of table areas to process as strings of the form
|
||||||
x1,y1,x2,y2 where (x1, y1) -> left-top and
|
x1,y1,x2,y2 where (x1, y1) -> left-top and
|
||||||
|
|
@ -43,8 +43,8 @@ def plot_geometry(filepath, pages='1', mesh=False, geometry_type=None, **kwargs)
|
||||||
multiple cells.
|
multiple cells.
|
||||||
flag_size : bool, optional (default: False)
|
flag_size : bool, optional (default: False)
|
||||||
Whether or not to highlight a substring using <s></s>
|
Whether or not to highlight a substring using <s></s>
|
||||||
if its size is different from rest of the string, useful for
|
if its size is different from rest of the string. (Useful for
|
||||||
super and subscripts.
|
super and subscripts.)
|
||||||
row_close_tol^ : int, optional (default: 2)
|
row_close_tol^ : int, optional (default: 2)
|
||||||
Rows will be formed by combining text vertically
|
Rows will be formed by combining text vertically
|
||||||
within this tolerance.
|
within this tolerance.
|
||||||
|
|
@ -74,24 +74,24 @@ def plot_geometry(filepath, pages='1', mesh=False, geometry_type=None, **kwargs)
|
||||||
joint_close_tol* : int, optional (default: 2)
|
joint_close_tol* : int, optional (default: 2)
|
||||||
Tolerance parameter used to decide whether the detected lines
|
Tolerance parameter used to decide whether the detected lines
|
||||||
and points lie close to each other.
|
and points lie close to each other.
|
||||||
threshold_blocksize : int, optional (default: 15)
|
threshold_blocksize* : int, optional (default: 15)
|
||||||
Size of a pixel neighborhood that is used to calculate a
|
Size of a pixel neighborhood that is used to calculate a
|
||||||
threshold value for the pixel: 3, 5, 7, and so on.
|
threshold value for the pixel: 3, 5, 7, and so on.
|
||||||
|
|
||||||
For more information, refer to `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
|
For more information, refer to `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
|
||||||
threshold_constant : int, optional (default: -2)
|
threshold_constant* : int, optional (default: -2)
|
||||||
Constant subtracted from the mean or weighted mean.
|
Constant subtracted from the mean or weighted mean.
|
||||||
Normally, it is positive but may be zero or negative as well.
|
Normally, it is positive but may be zero or negative as well.
|
||||||
|
|
||||||
For more information, refer to `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
|
For more information, refer to `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
|
||||||
iterations : int, optional (default: 0)
|
iterations* : int, optional (default: 0)
|
||||||
Number of times erosion/dilation is applied.
|
Number of times erosion/dilation is applied.
|
||||||
|
|
||||||
For more information, refer to `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
|
For more information, refer to `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
|
||||||
margins : tuple
|
margins : tuple
|
||||||
PDFMiner margins. (char_margin, line_margin, word_margin)
|
PDFMiner margins. (char_margin, line_margin, word_margin)
|
||||||
|
|
||||||
For for information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
|
For more information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
validate_input(kwargs, mesh=mesh, geometry_type=geometry_type)
|
validate_input(kwargs, mesh=mesh, geometry_type=geometry_type)
|
||||||
|
|
|
||||||
|
|
@ -454,8 +454,8 @@ def split_textline(table, textline, direction, flag_size=False):
|
||||||
Direction of the PDFMiner LTTextLine object.
|
Direction of the PDFMiner LTTextLine object.
|
||||||
flag_size : bool, optional (default: False)
|
flag_size : bool, optional (default: False)
|
||||||
Whether or not to highlight a substring using <s></s>
|
Whether or not to highlight a substring using <s></s>
|
||||||
if its size is different from rest of the string, useful for
|
if its size is different from rest of the string. (Useful for
|
||||||
super and subscripts.
|
super and subscripts.)
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
|
|
@ -530,8 +530,8 @@ def get_table_index(table, t, direction, split_text=False, flag_size=False):
|
||||||
multiple cells.
|
multiple cells.
|
||||||
flag_size : bool, optional (default: False)
|
flag_size : bool, optional (default: False)
|
||||||
Whether or not to highlight a substring using <s></s>
|
Whether or not to highlight a substring using <s></s>
|
||||||
if its size is different from rest of the string, useful for
|
if its size is different from rest of the string. (Useful for
|
||||||
super and subscripts.
|
super and subscripts)
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue