Fix docstrings and interlinks

pull/2/head
Vinayak Mehta 2018-09-11 07:35:30 +05:30
parent 3713c08642
commit 17ea5f335e
7 changed files with 65 additions and 49 deletions

View File

@ -46,7 +46,6 @@ class Cell(object):
Whether or not cell spans vertically.
text : string
Text assigned to cell.
bound
"""
@ -101,8 +100,7 @@ class Table(object):
Attributes
----------
df : object
pandas.DataFrame
df : :class:`pandas.DataFrame`
shape : tuple
Shape of the table.
accuracy : float
@ -113,8 +111,6 @@ class Table(object):
Table number on pdf page.
page : int
Pdf page number.
data
parsing_report
"""
def __init__(self, cols, rows):
@ -143,7 +139,7 @@ class Table(object):
@property
def parsing_report(self):
"""Returns a parsing report with accuracy, %whitespace,
"""Returns a parsing report with %accuracy, %whitespace,
table number on page and page number.
"""
# pretty?
@ -320,10 +316,15 @@ class Table(object):
return self
def to_csv(self, path, **kwargs):
"""Write Table to a comma-separated values (csv) file.
"""Writes Table to a comma-separated values (csv) file.
For kwargs, check :meth:`pandas.DataFrame.to_csv`.
Parameters
----------
path : str
Output filepath.
Check `pandas.DataFrame.to_csv <https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_csv.html>`_
kwargs for more details around what kwargs to use.
"""
kw = {
'encoding': 'utf-8',
@ -334,10 +335,15 @@ class Table(object):
self.df.to_csv(path, **kw)
def to_json(self, path, **kwargs):
"""Write Table to a JSON file.
"""Writes Table to a JSON file.
For kwargs, check :meth:`pandas.DataFrame.to_json`.
Parameters
----------
path : str
Output filepath.
Check `pandas.DataFrame.to_json <https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_json.html>`_
kwargs for more details around what kwargs to use.
"""
kw = {
'orient': 'records'
@ -348,10 +354,15 @@ class Table(object):
f.write(json_string)
def to_excel(self, path, **kwargs):
"""Write Table to an Excel file.
"""Writes Table to an Excel file.
For kwargs, check :meth:`pandas.DataFrame.to_excel`.
Parameters
----------
path : str
Output filepath.
Check `pandas.DataFrame.to_excel <https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_excel.html>`_
kwargs for more details around what kwargs to use.
"""
kw = {
'sheet_name': 'page-{}-table-{}'.format(self.page, self.order),
@ -363,10 +374,15 @@ class Table(object):
writer.save()
def to_html(self, path, **kwargs):
"""Write Table to an HTML file.
"""Writes Table to an HTML file.
For kwargs, check :meth:`pandas.DataFrame.to_html`.
Parameters
----------
path : str
Output filepath.
Check `pandas.DataFrame.to_html <https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_html.html>`_
kwargs for more details around what kwargs to use.
"""
html_string = self.df.to_html(**kwargs)
with open(path, 'w') as f:
@ -434,7 +450,7 @@ class TableList(object):
Parameters
----------
path : str
Filepath
Output filepath.
f : str
File format. Can be csv, json, excel or html.
compress : bool

View File

@ -13,8 +13,8 @@ class PDFHandler(object):
file into single page pdfs, parsing each pdf and then removing the
temp directory.
Parameter
---------
Parameters
----------
filename : str
Path to pdf file.
pages : str

View File

@ -30,8 +30,8 @@ def read_pdf(filepath, pages='1', mesh=False, **kwargs):
multiple cells.
flag_size : bool, optional (default: False)
Whether or not to highlight a substring using <s></s>
if its size is different from rest of the string, useful for
super and subscripts.
if its size is different from rest of the string. (Useful for
super and subscripts)
row_close_tol^ : int, optional (default: 2)
Rows will be formed by combining text vertically
within this tolerance.
@ -61,24 +61,24 @@ def read_pdf(filepath, pages='1', mesh=False, **kwargs):
joint_close_tol* : int, optional (default: 2)
Tolerance parameter used to decide whether the detected lines
and points lie close to each other.
threshold_blocksize : int, optional (default: 15)
threshold_blocksize* : int, optional (default: 15)
Size of a pixel neighborhood that is used to calculate a
threshold value for the pixel: 3, 5, 7, and so on.
For more information, refer to `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
threshold_constant : int, optional (default: -2)
threshold_constant* : int, optional (default: -2)
Constant subtracted from the mean or weighted mean.
Normally, it is positive but may be zero or negative as well.
For more information, refer to `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
iterations : int, optional (default: 0)
iterations* : int, optional (default: 0)
Number of times erosion/dilation is applied.
For more information, refer to `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
margins : tuple
PDFMiner margins. (char_margin, line_margin, word_margin)
For for information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
For more information, refer to `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
Returns
-------

View File

@ -51,8 +51,8 @@ class Lattice(BaseParser):
multiple cells.
flag_size : bool, optional (default: False)
Whether or not to highlight a substring using <s></s>
if its size is different from rest of the string, useful for
super and subscripts.
if its size is different from rest of the string. (Useful for
super and subscripts)
line_close_tol : int, optional (default: 2)
Tolerance parameter used to merge vertical and horizontal
detected lines which lie close to each other.
@ -76,7 +76,7 @@ class Lattice(BaseParser):
margins : tuple
PDFMiner margins. (char_margin, line_margin, word_margin)
For for information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
For more information, refer to `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
debug : bool, optional (default: False)
Whether or not to return all text objects on the page
which can be used to generate a matplotlib plot, to get

View File

@ -35,8 +35,8 @@ class Stream(BaseParser):
multiple cells.
flag_size : bool, optional (default: False)
Whether or not to highlight a substring using <s></s>
if its size is different from rest of the string, useful for
super and subscripts.
if its size is different from rest of the string. (Useful for
super and subscripts)
row_close_tol : int, optional (default: 2)
Rows will be formed by combining text vertically
within this tolerance.
@ -46,7 +46,7 @@ class Stream(BaseParser):
margins : tuple, optional (default: (1.0, 0.5, 0.1))
PDFMiner margins. (char_margin, line_margin, word_margin)
For for information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
For more information, refer to `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
debug : bool, optional (default: False)
Whether or not to return all text objects on the page
which can be used to generate a matplotlib plot, to get

View File

@ -25,12 +25,12 @@ def plot_geometry(filepath, pages='1', mesh=False, geometry_type=None, **kwargs)
Whether or not to use Lattice method of parsing. Stream
is used by default.
geometry_type : str, optional (default: None)
'text' : Plot text objects found on page, useful to get
table_area and columns coordinates.
'table' : Plot parsed table.
'contour'* : Plot detected rectangles.
'joint'* : Plot detected line intersections.
'line'* : Plot detected lines.
* 'text' : Plot text objects found on page. (Useful to get \
table_area and columns coordinates)
* 'table' : Plot parsed table.
* 'contour'* : Plot detected rectangles.
* 'joint'* : Plot detected line intersections.
* 'line'* : Plot detected lines.
table_area : list, optional (default: None)
List of table areas to process as strings of the form
x1,y1,x2,y2 where (x1, y1) -> left-top and
@ -43,8 +43,8 @@ def plot_geometry(filepath, pages='1', mesh=False, geometry_type=None, **kwargs)
multiple cells.
flag_size : bool, optional (default: False)
Whether or not to highlight a substring using <s></s>
if its size is different from rest of the string, useful for
super and subscripts.
if its size is different from rest of the string. (Useful for
super and subscripts.)
row_close_tol^ : int, optional (default: 2)
Rows will be formed by combining text vertically
within this tolerance.
@ -74,24 +74,24 @@ def plot_geometry(filepath, pages='1', mesh=False, geometry_type=None, **kwargs)
joint_close_tol* : int, optional (default: 2)
Tolerance parameter used to decide whether the detected lines
and points lie close to each other.
threshold_blocksize : int, optional (default: 15)
threshold_blocksize* : int, optional (default: 15)
Size of a pixel neighborhood that is used to calculate a
threshold value for the pixel: 3, 5, 7, and so on.
For more information, refer to `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
threshold_constant : int, optional (default: -2)
threshold_constant* : int, optional (default: -2)
Constant subtracted from the mean or weighted mean.
Normally, it is positive but may be zero or negative as well.
For more information, refer to `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
iterations : int, optional (default: 0)
iterations* : int, optional (default: 0)
Number of times erosion/dilation is applied.
For more information, refer to `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
margins : tuple
PDFMiner margins. (char_margin, line_margin, word_margin)
For for information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
For more information, refer to `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
"""
validate_input(kwargs, mesh=mesh, geometry_type=geometry_type)

View File

@ -454,8 +454,8 @@ def split_textline(table, textline, direction, flag_size=False):
Direction of the PDFMiner LTTextLine object.
flag_size : bool, optional (default: False)
Whether or not to highlight a substring using <s></s>
if its size is different from rest of the string, useful for
super and subscripts.
if its size is different from rest of the string. (Useful for
super and subscripts.)
Returns
-------
@ -530,8 +530,8 @@ def get_table_index(table, t, direction, split_text=False, flag_size=False):
multiple cells.
flag_size : bool, optional (default: False)
Whether or not to highlight a substring using <s></s>
if its size is different from rest of the string, useful for
super and subscripts.
if its size is different from rest of the string. (Useful for
super and subscripts)
Returns
-------