Fix docstrings and interlinks
parent
72a22dbd06
commit
f72e83a1b3
|
|
@ -46,7 +46,6 @@ class Cell(object):
|
|||
Whether or not cell spans vertically.
|
||||
text : string
|
||||
Text assigned to cell.
|
||||
bound
|
||||
|
||||
"""
|
||||
|
||||
|
|
@ -101,8 +100,7 @@ class Table(object):
|
|||
|
||||
Attributes
|
||||
----------
|
||||
df : object
|
||||
pandas.DataFrame
|
||||
df : :class:`pandas.DataFrame`
|
||||
shape : tuple
|
||||
Shape of the table.
|
||||
accuracy : float
|
||||
|
|
@ -113,8 +111,6 @@ class Table(object):
|
|||
Table number on pdf page.
|
||||
page : int
|
||||
Pdf page number.
|
||||
data
|
||||
parsing_report
|
||||
|
||||
"""
|
||||
def __init__(self, cols, rows):
|
||||
|
|
@ -143,7 +139,7 @@ class Table(object):
|
|||
|
||||
@property
|
||||
def parsing_report(self):
|
||||
"""Returns a parsing report with accuracy, %whitespace,
|
||||
"""Returns a parsing report with %accuracy, %whitespace,
|
||||
table number on page and page number.
|
||||
"""
|
||||
# pretty?
|
||||
|
|
@ -320,10 +316,15 @@ class Table(object):
|
|||
return self
|
||||
|
||||
def to_csv(self, path, **kwargs):
|
||||
"""Write Table to a comma-separated values (csv) file.
|
||||
"""Writes Table to a comma-separated values (csv) file.
|
||||
|
||||
For kwargs, check :meth:`pandas.DataFrame.to_csv`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path : str
|
||||
Output filepath.
|
||||
|
||||
Check `pandas.DataFrame.to_csv <https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_csv.html>`_
|
||||
kwargs for more details around what kwargs to use.
|
||||
"""
|
||||
kw = {
|
||||
'encoding': 'utf-8',
|
||||
|
|
@ -334,10 +335,15 @@ class Table(object):
|
|||
self.df.to_csv(path, **kw)
|
||||
|
||||
def to_json(self, path, **kwargs):
|
||||
"""Write Table to a JSON file.
|
||||
"""Writes Table to a JSON file.
|
||||
|
||||
For kwargs, check :meth:`pandas.DataFrame.to_json`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path : str
|
||||
Output filepath.
|
||||
|
||||
Check `pandas.DataFrame.to_json <https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_json.html>`_
|
||||
kwargs for more details around what kwargs to use.
|
||||
"""
|
||||
kw = {
|
||||
'orient': 'records'
|
||||
|
|
@ -348,10 +354,15 @@ class Table(object):
|
|||
f.write(json_string)
|
||||
|
||||
def to_excel(self, path, **kwargs):
|
||||
"""Write Table to an Excel file.
|
||||
"""Writes Table to an Excel file.
|
||||
|
||||
For kwargs, check :meth:`pandas.DataFrame.to_excel`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path : str
|
||||
Output filepath.
|
||||
|
||||
Check `pandas.DataFrame.to_excel <https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_excel.html>`_
|
||||
kwargs for more details around what kwargs to use.
|
||||
"""
|
||||
kw = {
|
||||
'sheet_name': 'page-{}-table-{}'.format(self.page, self.order),
|
||||
|
|
@ -363,10 +374,15 @@ class Table(object):
|
|||
writer.save()
|
||||
|
||||
def to_html(self, path, **kwargs):
|
||||
"""Write Table to an HTML file.
|
||||
"""Writes Table to an HTML file.
|
||||
|
||||
For kwargs, check :meth:`pandas.DataFrame.to_html`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path : str
|
||||
Output filepath.
|
||||
|
||||
Check `pandas.DataFrame.to_html <https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_html.html>`_
|
||||
kwargs for more details around what kwargs to use.
|
||||
"""
|
||||
html_string = self.df.to_html(**kwargs)
|
||||
with open(path, 'w') as f:
|
||||
|
|
@ -434,7 +450,7 @@ class TableList(object):
|
|||
Parameters
|
||||
----------
|
||||
path : str
|
||||
Filepath
|
||||
Output filepath.
|
||||
f : str
|
||||
File format. Can be csv, json, excel and html.
|
||||
compress : bool
|
||||
|
|
|
|||
|
|
@ -13,8 +13,8 @@ class PDFHandler(object):
|
|||
file into single page pdfs, parsing each pdf and then removing the
|
||||
temp directory.
|
||||
|
||||
Parameter
|
||||
---------
|
||||
Parameters
|
||||
----------
|
||||
filename : str
|
||||
Path to pdf file.
|
||||
pages : str
|
||||
|
|
|
|||
|
|
@ -30,8 +30,8 @@ def read_pdf(filepath, pages='1', mesh=False, **kwargs):
|
|||
multiple cells.
|
||||
flag_size : bool, optional (default: False)
|
||||
Whether or not to highlight a substring using <s></s>
|
||||
if its size is different from rest of the string, useful for
|
||||
super and subscripts.
|
||||
if its size is different from rest of the string. (Useful for
|
||||
super and subscripts)
|
||||
row_close_tol^ : int, optional (default: 2)
|
||||
Rows will be formed by combining text vertically
|
||||
within this tolerance.
|
||||
|
|
@ -61,24 +61,24 @@ def read_pdf(filepath, pages='1', mesh=False, **kwargs):
|
|||
joint_close_tol* : int, optional (default: 2)
|
||||
Tolerance parameter used to decide whether the detected lines
|
||||
and points lie close to each other.
|
||||
threshold_blocksize : int, optional (default: 15)
|
||||
threshold_blocksize* : int, optional (default: 15)
|
||||
Size of a pixel neighborhood that is used to calculate a
|
||||
threshold value for the pixel: 3, 5, 7, and so on.
|
||||
|
||||
For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
|
||||
threshold_constant : int, optional (default: -2)
|
||||
threshold_constant* : int, optional (default: -2)
|
||||
Constant subtracted from the mean or weighted mean.
|
||||
Normally, it is positive but may be zero or negative as well.
|
||||
|
||||
For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
|
||||
iterations : int, optional (default: 0)
|
||||
iterations* : int, optional (default: 0)
|
||||
Number of times erosion/dilation is applied.
|
||||
|
||||
For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
|
||||
margins : tuple
|
||||
PDFMiner margins. (char_margin, line_margin, word_margin)
|
||||
|
||||
For for information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
|
||||
For more information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
|
||||
|
||||
Returns
|
||||
-------
|
||||
|
|
|
|||
|
|
@ -51,8 +51,8 @@ class Lattice(BaseParser):
|
|||
multiple cells.
|
||||
flag_size : bool, optional (default: False)
|
||||
Whether or not to highlight a substring using <s></s>
|
||||
if its size is different from rest of the string, useful for
|
||||
super and subscripts.
|
||||
if its size is different from rest of the string. (Useful for
|
||||
super and subscripts)
|
||||
line_close_tol : int, optional (default: 2)
|
||||
Tolerance parameter used to merge vertical and horizontal
|
||||
detected lines which lie close to each other.
|
||||
|
|
@ -76,7 +76,7 @@ class Lattice(BaseParser):
|
|||
margins : tuple
|
||||
PDFMiner margins. (char_margin, line_margin, word_margin)
|
||||
|
||||
For for information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
|
||||
For more information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
|
||||
debug : bool, optional (default: False)
|
||||
Whether or not to return all text objects on the page
|
||||
which can be used to generate a matplotlib plot, to get
|
||||
|
|
|
|||
|
|
@ -35,8 +35,8 @@ class Stream(BaseParser):
|
|||
multiple cells.
|
||||
flag_size : bool, optional (default: False)
|
||||
Whether or not to highlight a substring using <s></s>
|
||||
if its size is different from rest of the string, useful for
|
||||
super and subscripts.
|
||||
if its size is different from rest of the string. (Useful for
|
||||
super and subscripts)
|
||||
row_close_tol : int, optional (default: 2)
|
||||
Rows will be formed by combining text vertically
|
||||
within this tolerance.
|
||||
|
|
@ -46,7 +46,7 @@ class Stream(BaseParser):
|
|||
margins : tuple, optional (default: (1.0, 0.5, 0.1))
|
||||
PDFMiner margins. (char_margin, line_margin, word_margin)
|
||||
|
||||
For for information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
|
||||
For more information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
|
||||
debug : bool, optional (default: False)
|
||||
Whether or not to return all text objects on the page
|
||||
which can be used to generate a matplotlib plot, to get
|
||||
|
|
|
|||
|
|
@ -25,12 +25,12 @@ def plot_geometry(filepath, pages='1', mesh=False, geometry_type=None, **kwargs)
|
|||
Whether or not to use Lattice method of parsing. Stream
|
||||
is used by default.
|
||||
geometry_type : str, optional (default: None)
|
||||
'text' : Plot text objects found on page, useful to get
|
||||
table_area and columns coordinates.
|
||||
'table' : Plot parsed table.
|
||||
'contour'* : Plot detected rectangles.
|
||||
'joint'* : Plot detected line intersections.
|
||||
'line'* : Plot detected lines.
|
||||
* 'text' : Plot text objects found on page. (Useful to get \
|
||||
table_area and columns coordinates)
|
||||
* 'table' : Plot parsed table.
|
||||
* 'contour'* : Plot detected rectangles.
|
||||
* 'joint'* : Plot detected line intersections.
|
||||
* 'line'* : Plot detected lines.
|
||||
table_area : list, optional (default: None)
|
||||
List of table areas to process as strings of the form
|
||||
x1,y1,x2,y2 where (x1, y1) -> left-top and
|
||||
|
|
@ -43,8 +43,8 @@ def plot_geometry(filepath, pages='1', mesh=False, geometry_type=None, **kwargs)
|
|||
multiple cells.
|
||||
flag_size : bool, optional (default: False)
|
||||
Whether or not to highlight a substring using <s></s>
|
||||
if its size is different from rest of the string, useful for
|
||||
super and subscripts.
|
||||
if its size is different from rest of the string. (Useful for
|
||||
super and subscripts.)
|
||||
row_close_tol^ : int, optional (default: 2)
|
||||
Rows will be formed by combining text vertically
|
||||
within this tolerance.
|
||||
|
|
@ -74,24 +74,24 @@ def plot_geometry(filepath, pages='1', mesh=False, geometry_type=None, **kwargs)
|
|||
joint_close_tol* : int, optional (default: 2)
|
||||
Tolerance parameter used to decide whether the detected lines
|
||||
and points lie close to each other.
|
||||
threshold_blocksize : int, optional (default: 15)
|
||||
threshold_blocksize* : int, optional (default: 15)
|
||||
Size of a pixel neighborhood that is used to calculate a
|
||||
threshold value for the pixel: 3, 5, 7, and so on.
|
||||
|
||||
For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
|
||||
threshold_constant : int, optional (default: -2)
|
||||
threshold_constant* : int, optional (default: -2)
|
||||
Constant subtracted from the mean or weighted mean.
|
||||
Normally, it is positive but may be zero or negative as well.
|
||||
|
||||
For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
|
||||
iterations : int, optional (default: 0)
|
||||
iterations* : int, optional (default: 0)
|
||||
Number of times erosion/dilation is applied.
|
||||
|
||||
For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
|
||||
margins : tuple
|
||||
PDFMiner margins. (char_margin, line_margin, word_margin)
|
||||
|
||||
For for information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
|
||||
For more information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
|
||||
|
||||
"""
|
||||
validate_input(kwargs, mesh=mesh, geometry_type=geometry_type)
|
||||
|
|
|
|||
|
|
@ -454,8 +454,8 @@ def split_textline(table, textline, direction, flag_size=False):
|
|||
Direction of the PDFMiner LTTextLine object.
|
||||
flag_size : bool, optional (default: False)
|
||||
Whether or not to highlight a substring using <s></s>
|
||||
if its size is different from rest of the string, useful for
|
||||
super and subscripts.
|
||||
if its size is different from rest of the string. (Useful for
|
||||
super and subscripts.)
|
||||
|
||||
Returns
|
||||
-------
|
||||
|
|
@ -530,8 +530,8 @@ def get_table_index(table, t, direction, split_text=False, flag_size=False):
|
|||
multiple cells.
|
||||
flag_size : bool, optional (default: False)
|
||||
Whether or not to highlight a substring using <s></s>
|
||||
if its size is different from rest of the string, useful for
|
||||
super and subscripts.
|
||||
if its size is different from rest of the string. (Useful for
|
||||
super and subscripts)
|
||||
|
||||
Returns
|
||||
-------
|
||||
|
|
|
|||
Loading…
Reference in New Issue