Fix docstrings and interlinks

pull/2/head
Vinayak Mehta 2018-09-11 07:35:30 +05:30
parent 72a22dbd06
commit f72e83a1b3
7 changed files with 65 additions and 49 deletions

View File

@ -46,7 +46,6 @@ class Cell(object):
Whether or not cell spans vertically. Whether or not cell spans vertically.
text : string text : string
Text assigned to cell. Text assigned to cell.
bound
""" """
@ -101,8 +100,7 @@ class Table(object):
Attributes Attributes
---------- ----------
df : object df : :class:`pandas.DataFrame`
pandas.DataFrame
shape : tuple shape : tuple
Shape of the table. Shape of the table.
accuracy : float accuracy : float
@ -113,8 +111,6 @@ class Table(object):
Table number on pdf page. Table number on pdf page.
page : int page : int
Pdf page number. Pdf page number.
data
parsing_report
""" """
def __init__(self, cols, rows): def __init__(self, cols, rows):
@ -143,7 +139,7 @@ class Table(object):
@property @property
def parsing_report(self): def parsing_report(self):
"""Returns a parsing report with accuracy, %whitespace, """Returns a parsing report with %accuracy, %whitespace,
table number on page and page number. table number on page and page number.
""" """
# pretty? # pretty?
@ -320,10 +316,15 @@ class Table(object):
return self return self
def to_csv(self, path, **kwargs): def to_csv(self, path, **kwargs):
"""Write Table to a comma-separated values (csv) file. """Writes Table to a comma-separated values (csv) file.
For kwargs, check :meth:`pandas.DataFrame.to_csv`.
Parameters
----------
path : str
Output filepath.
Check `pandas.DataFrame.to_csv <https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_csv.html>`_
kwargs for more details around what kwargs to use.
""" """
kw = { kw = {
'encoding': 'utf-8', 'encoding': 'utf-8',
@ -334,10 +335,15 @@ class Table(object):
self.df.to_csv(path, **kw) self.df.to_csv(path, **kw)
def to_json(self, path, **kwargs): def to_json(self, path, **kwargs):
"""Write Table to a JSON file. """Writes Table to a JSON file.
For kwargs, check :meth:`pandas.DataFrame.to_json`.
Parameters
----------
path : str
Output filepath.
Check `pandas.DataFrame.to_json <https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_json.html>`_
kwargs for more details around what kwargs to use.
""" """
kw = { kw = {
'orient': 'records' 'orient': 'records'
@ -348,10 +354,15 @@ class Table(object):
f.write(json_string) f.write(json_string)
def to_excel(self, path, **kwargs): def to_excel(self, path, **kwargs):
"""Write Table to an Excel file. """Writes Table to an Excel file.
For kwargs, check :meth:`pandas.DataFrame.to_excel`.
Parameters
----------
path : str
Output filepath.
Check `pandas.DataFrame.to_excel <https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_excel.html>`_
kwargs for more details around what kwargs to use.
""" """
kw = { kw = {
'sheet_name': 'page-{}-table-{}'.format(self.page, self.order), 'sheet_name': 'page-{}-table-{}'.format(self.page, self.order),
@ -363,10 +374,15 @@ class Table(object):
writer.save() writer.save()
def to_html(self, path, **kwargs): def to_html(self, path, **kwargs):
"""Write Table to an HTML file. """Writes Table to an HTML file.
For kwargs, check :meth:`pandas.DataFrame.to_html`.
Parameters
----------
path : str
Output filepath.
Check `pandas.DataFrame.to_html <https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_html.html>`_
kwargs for more details around what kwargs to use.
""" """
html_string = self.df.to_html(**kwargs) html_string = self.df.to_html(**kwargs)
with open(path, 'w') as f: with open(path, 'w') as f:
@ -434,7 +450,7 @@ class TableList(object):
Parameters Parameters
---------- ----------
path : str path : str
Filepath Output filepath.
f : str f : str
File format. Can be csv, json, excel and html. File format. Can be csv, json, excel and html.
compress : bool compress : bool

View File

@ -13,8 +13,8 @@ class PDFHandler(object):
file into single page pdfs, parsing each pdf and then removing the file into single page pdfs, parsing each pdf and then removing the
temp directory. temp directory.
Parameter Parameters
--------- ----------
filename : str filename : str
Path to pdf file. Path to pdf file.
pages : str pages : str

View File

@ -30,8 +30,8 @@ def read_pdf(filepath, pages='1', mesh=False, **kwargs):
multiple cells. multiple cells.
flag_size : bool, optional (default: False) flag_size : bool, optional (default: False)
Whether or not to highlight a substring using <s></s> Whether or not to highlight a substring using <s></s>
if its size is different from rest of the string, useful for if its size is different from rest of the string. (Useful for
super and subscripts. super and subscripts)
row_close_tol^ : int, optional (default: 2) row_close_tol^ : int, optional (default: 2)
Rows will be formed by combining text vertically Rows will be formed by combining text vertically
within this tolerance. within this tolerance.
@ -61,24 +61,24 @@ def read_pdf(filepath, pages='1', mesh=False, **kwargs):
joint_close_tol* : int, optional (default: 2) joint_close_tol* : int, optional (default: 2)
Tolerance parameter used to decide whether the detected lines Tolerance parameter used to decide whether the detected lines
and points lie close to each other. and points lie close to each other.
threshold_blocksize : int, optional (default: 15) threshold_blocksize* : int, optional (default: 15)
Size of a pixel neighborhood that is used to calculate a Size of a pixel neighborhood that is used to calculate a
threshold value for the pixel: 3, 5, 7, and so on. threshold value for the pixel: 3, 5, 7, and so on.
For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_. For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
threshold_constant : int, optional (default: -2) threshold_constant* : int, optional (default: -2)
Constant subtracted from the mean or weighted mean. Constant subtracted from the mean or weighted mean.
Normally, it is positive but may be zero or negative as well. Normally, it is positive but may be zero or negative as well.
For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_. For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
iterations : int, optional (default: 0) iterations* : int, optional (default: 0)
Number of times erosion/dilation is applied. Number of times erosion/dilation is applied.
For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_. For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
margins : tuple margins : tuple
PDFMiner margins. (char_margin, line_margin, word_margin) PDFMiner margins. (char_margin, line_margin, word_margin)
For for information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_. For more information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
Returns Returns
------- -------

View File

@ -51,8 +51,8 @@ class Lattice(BaseParser):
multiple cells. multiple cells.
flag_size : bool, optional (default: False) flag_size : bool, optional (default: False)
Whether or not to highlight a substring using <s></s> Whether or not to highlight a substring using <s></s>
if its size is different from rest of the string, useful for if its size is different from rest of the string. (Useful for
super and subscripts. super and subscripts)
line_close_tol : int, optional (default: 2) line_close_tol : int, optional (default: 2)
Tolerance parameter used to merge vertical and horizontal Tolerance parameter used to merge vertical and horizontal
detected lines which lie close to each other. detected lines which lie close to each other.
@ -76,7 +76,7 @@ class Lattice(BaseParser):
margins : tuple margins : tuple
PDFMiner margins. (char_margin, line_margin, word_margin) PDFMiner margins. (char_margin, line_margin, word_margin)
For for information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_. For more information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
debug : bool, optional (default: False) debug : bool, optional (default: False)
Whether or not to return all text objects on the page Whether or not to return all text objects on the page
which can be used to generate a matplotlib plot, to get which can be used to generate a matplotlib plot, to get

View File

@ -35,8 +35,8 @@ class Stream(BaseParser):
multiple cells. multiple cells.
flag_size : bool, optional (default: False) flag_size : bool, optional (default: False)
Whether or not to highlight a substring using <s></s> Whether or not to highlight a substring using <s></s>
if its size is different from rest of the string, useful for if its size is different from rest of the string. (Useful for
super and subscripts. super and subscripts)
row_close_tol : int, optional (default: 2) row_close_tol : int, optional (default: 2)
Rows will be formed by combining text vertically Rows will be formed by combining text vertically
within this tolerance. within this tolerance.
@ -46,7 +46,7 @@ class Stream(BaseParser):
margins : tuple, optional (default: (1.0, 0.5, 0.1)) margins : tuple, optional (default: (1.0, 0.5, 0.1))
PDFMiner margins. (char_margin, line_margin, word_margin) PDFMiner margins. (char_margin, line_margin, word_margin)
For for information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_. For more information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
debug : bool, optional (default: False) debug : bool, optional (default: False)
Whether or not to return all text objects on the page Whether or not to return all text objects on the page
which can be used to generate a matplotlib plot, to get which can be used to generate a matplotlib plot, to get

View File

@ -25,12 +25,12 @@ def plot_geometry(filepath, pages='1', mesh=False, geometry_type=None, **kwargs)
Whether or not to use Lattice method of parsing. Stream Whether or not to use Lattice method of parsing. Stream
is used by default. is used by default.
geometry_type : str, optional (default: None) geometry_type : str, optional (default: None)
'text' : Plot text objects found on page, useful to get * 'text' : Plot text objects found on page. (Useful to get \
table_area and columns coordinates. table_area and columns coordinates)
'table' : Plot parsed table. * 'table' : Plot parsed table.
'contour'* : Plot detected rectangles. * 'contour'* : Plot detected rectangles.
'joint'* : Plot detected line intersections. * 'joint'* : Plot detected line intersections.
'line'* : Plot detected lines. * 'line'* : Plot detected lines.
table_area : list, optional (default: None) table_area : list, optional (default: None)
List of table areas to process as strings of the form List of table areas to process as strings of the form
x1,y1,x2,y2 where (x1, y1) -> left-top and x1,y1,x2,y2 where (x1, y1) -> left-top and
@ -43,8 +43,8 @@ def plot_geometry(filepath, pages='1', mesh=False, geometry_type=None, **kwargs)
multiple cells. multiple cells.
flag_size : bool, optional (default: False) flag_size : bool, optional (default: False)
Whether or not to highlight a substring using <s></s> Whether or not to highlight a substring using <s></s>
if its size is different from rest of the string, useful for if its size is different from rest of the string. (Useful for
super and subscripts. super and subscripts.)
row_close_tol^ : int, optional (default: 2) row_close_tol^ : int, optional (default: 2)
Rows will be formed by combining text vertically Rows will be formed by combining text vertically
within this tolerance. within this tolerance.
@ -74,24 +74,24 @@ def plot_geometry(filepath, pages='1', mesh=False, geometry_type=None, **kwargs)
joint_close_tol* : int, optional (default: 2) joint_close_tol* : int, optional (default: 2)
Tolerance parameter used to decide whether the detected lines Tolerance parameter used to decide whether the detected lines
and points lie close to each other. and points lie close to each other.
threshold_blocksize : int, optional (default: 15) threshold_blocksize* : int, optional (default: 15)
Size of a pixel neighborhood that is used to calculate a Size of a pixel neighborhood that is used to calculate a
threshold value for the pixel: 3, 5, 7, and so on. threshold value for the pixel: 3, 5, 7, and so on.
For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_. For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
threshold_constant : int, optional (default: -2) threshold_constant* : int, optional (default: -2)
Constant subtracted from the mean or weighted mean. Constant subtracted from the mean or weighted mean.
Normally, it is positive but may be zero or negative as well. Normally, it is positive but may be zero or negative as well.
For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_. For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
iterations : int, optional (default: 0) iterations* : int, optional (default: 0)
Number of times erosion/dilation is applied. Number of times erosion/dilation is applied.
For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_. For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
margins : tuple margins : tuple
PDFMiner margins. (char_margin, line_margin, word_margin) PDFMiner margins. (char_margin, line_margin, word_margin)
For for information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_. For more information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
""" """
validate_input(kwargs, mesh=mesh, geometry_type=geometry_type) validate_input(kwargs, mesh=mesh, geometry_type=geometry_type)

View File

@ -454,8 +454,8 @@ def split_textline(table, textline, direction, flag_size=False):
Direction of the PDFMiner LTTextLine object. Direction of the PDFMiner LTTextLine object.
flag_size : bool, optional (default: False) flag_size : bool, optional (default: False)
Whether or not to highlight a substring using <s></s> Whether or not to highlight a substring using <s></s>
if its size is different from rest of the string, useful for if its size is different from rest of the string. (Useful for
super and subscripts. super and subscripts.)
Returns Returns
------- -------
@ -530,8 +530,8 @@ def get_table_index(table, t, direction, split_text=False, flag_size=False):
multiple cells. multiple cells.
flag_size : bool, optional (default: False) flag_size : bool, optional (default: False)
Whether or not to highlight a substring using <s></s> Whether or not to highlight a substring using <s></s>
if its size is different from rest of the string, useful for if its size is different from rest of the string. (Useful for
super and subscripts. super and subscripts)
Returns Returns
------- -------