Add docstrings and update docs
parent
16c6b8d45d
commit
9878de4dfc
|
|
@ -8,3 +8,5 @@ dist/
|
|||
.coverage
|
||||
|
||||
.pytest_cache/
|
||||
_build/
|
||||
_static/
|
||||
|
|
|
|||
63
README.md
63
README.md
|
|
@ -23,50 +23,9 @@ Camelot is a Python 2.7 library and command-line tool for extracting tabular dat
|
|||
>>> df = tables[0].df
|
||||
</pre>
|
||||
|
||||
Camelot comes with a CLI where you can specify page numbers, output format, output directory etc. By default, the output files are placed in the same directory as the PDF.
|
||||
|
||||
<pre>
|
||||
Camelot: PDF parsing made simpler!
|
||||
|
||||
usage:
|
||||
camelot [options] <method> [<args>...]
|
||||
|
||||
options:
|
||||
-h, --help Show this screen.
|
||||
-v, --version Show version.
|
||||
-V, --verbose Verbose.
|
||||
-p, --pages <pageno> Comma-separated list of page numbers.
|
||||
Example: -p 1,3-6,10 [default: 1]
|
||||
-P, --parallel Parallelize the parsing process.
|
||||
-f, --format <format> Output format. (csv,tsv,html,json,xlsx) [default: csv]
|
||||
-l, --log Log to file.
|
||||
-o, --output <directory> Output directory.
|
||||
-M, --cmargin <cmargin> Char margin. Chars closer than cmargin are
|
||||
grouped together to form a word. [default: 2.0]
|
||||
-L, --lmargin <lmargin> Line margin. Lines closer than lmargin are
|
||||
grouped together to form a textbox. [default: 0.5]
|
||||
-W, --wmargin <wmargin> Word margin. Insert blank spaces between chars
|
||||
if distance between words is greater than word
|
||||
margin. [default: 0.1]
|
||||
-J, --split_text Split text lines if they span across multiple cells.
|
||||
-K, --flag_size Flag substring if its size differs from the whole string.
|
||||
Useful for super and subscripts.
|
||||
-X, --print-stats List stats on the parsing process.
|
||||
-Y, --save-stats Save stats to a file.
|
||||
-Z, --plot <dist> Plot distributions. (page,all,rc)
|
||||
|
||||
camelot methods:
|
||||
lattice Looks for lines between data.
|
||||
stream Looks for spaces between data.
|
||||
|
||||
See 'camelot <method> -h' for more information on a specific method.
|
||||
</pre>
|
||||
|
||||
## Dependencies
|
||||
|
||||
Currently, camelot works under Python 2.7.
|
||||
|
||||
The required dependencies include [numpy](http://www.numpy.org/), [OpenCV](http://opencv.org/) and [ghostscript](https://www.ghostscript.com/).
|
||||
The dependencies include [tk](https://wiki.tcl.tk/3743) and [ghostscript](https://www.ghostscript.com/).
|
||||
|
||||
## Installation
|
||||
|
||||
|
|
@ -78,22 +37,22 @@ pip install -U pip setuptools
|
|||
|
||||
### Installing dependencies
|
||||
|
||||
numpy can be install using `pip`. OpenCV and ghostscript can be installed using your system's default package manager.
|
||||
tk and ghostscript can be installed using your system's default package manager.
|
||||
|
||||
#### Linux
|
||||
|
||||
* Arch Linux
|
||||
|
||||
<pre>
|
||||
sudo pacman -S opencv tk ghostscript
|
||||
</pre>
|
||||
|
||||
* Ubuntu
|
||||
|
||||
<pre>
|
||||
sudo apt-get install python-opencv python-tk ghostscript
|
||||
</pre>
|
||||
|
||||
* Arch Linux
|
||||
|
||||
<pre>
|
||||
sudo pacman -S opencv tk ghostscript
|
||||
</pre>
|
||||
|
||||
#### OS X
|
||||
|
||||
<pre>
|
||||
|
|
@ -103,7 +62,7 @@ brew install homebrew/science/opencv ghostscript
|
|||
Finally, `cd` into the project directory and install by
|
||||
|
||||
<pre>
|
||||
make install
|
||||
python setup.py install
|
||||
</pre>
|
||||
|
||||
## Development
|
||||
|
|
@ -118,12 +77,12 @@ git clone https://github.com/socialcopsdev/camelot.git
|
|||
|
||||
### Contributing
|
||||
|
||||
See [Contributing doc]().
|
||||
See [Contributing guidelines]().
|
||||
|
||||
### Testing
|
||||
|
||||
<pre>
|
||||
make test
|
||||
python setup.py test
|
||||
</pre>
|
||||
|
||||
## License
|
||||
|
|
|
|||
186
camelot/core.py
186
camelot/core.py
|
|
@ -8,9 +8,48 @@ import pandas as pd
|
|||
|
||||
|
||||
class Cell(object):
|
||||
"""
|
||||
"""Defines a cell in a table with coordinates relative to a
|
||||
left-bottom origin. (pdf coordinate space)
|
||||
|
||||
Parameters
|
||||
----------
|
||||
x1 : float
|
||||
x-coordinate of left-bottom point.
|
||||
y1 : float
|
||||
y-coordinate of left-bottom point.
|
||||
x2 : float
|
||||
x-coordinate of right-top point.
|
||||
y2 : float
|
||||
y-coordinate of right-top point.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
lb : tuple
|
||||
Tuple representing left-bottom coordinates.
|
||||
lt : tuple
|
||||
Tuple representing left-top coordinates.
|
||||
rb : tuple
|
||||
Tuple representing right-bottom coordinates.
|
||||
rt : tuple
|
||||
Tuple representing right-top coordinates.
|
||||
left : bool
|
||||
Whether or not cell is bounded on the left.
|
||||
right : bool
|
||||
Whether or not cell is bounded on the right.
|
||||
top : bool
|
||||
Whether or not cell is bounded on the top.
|
||||
bottom : bool
|
||||
Whether or not cell is bounded on the bottom.
|
||||
hspan : bool
|
||||
Whether or not cell spans horizontally.
|
||||
vspan : bool
|
||||
Whether or not cell spans vertically.
|
||||
text : string
|
||||
Text assigned to cell.
|
||||
bound
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, x1, y1, x2, y2):
|
||||
self.x1 = x1
|
||||
self.y1 = y1
|
||||
|
|
@ -34,37 +73,48 @@ class Cell(object):
|
|||
|
||||
@property
|
||||
def text(self):
|
||||
"""
|
||||
|
||||
Returns
|
||||
-------
|
||||
|
||||
"""
|
||||
return self._text
|
||||
|
||||
@text.setter
|
||||
def text(self, t):
|
||||
"""
|
||||
|
||||
Parameters
|
||||
----------
|
||||
t
|
||||
"""
|
||||
self._text = ''.join([self._text, t])
|
||||
|
||||
@property
|
||||
def bound(self):
|
||||
"""
|
||||
|
||||
Returns
|
||||
-------
|
||||
|
||||
"""The number of sides on which the cell is bounded.
|
||||
"""
|
||||
return self.top + self.bottom + self.left + self.right
|
||||
|
||||
|
||||
class Table(object):
|
||||
"""
|
||||
"""Defines a table with coordinates relative to a left-bottom
|
||||
origin. (pdf coordinate space)
|
||||
|
||||
Parameters
|
||||
----------
|
||||
cols : list
|
||||
List of tuples representing column x-coordinates in increasing
|
||||
order.
|
||||
rows : list
|
||||
List of tuples representing row y-coordinates in decreasing
|
||||
order.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
df : object
|
||||
pandas.DataFrame
|
||||
shape : tuple
|
||||
Shape of the table.
|
||||
accuracy : float
|
||||
Accuracy with which text was assigned to the cell.
|
||||
whitespace : float
|
||||
Percentage of whitespace in the table.
|
||||
order : int
|
||||
Table number on pdf page.
|
||||
page : int
|
||||
Pdf page number.
|
||||
data
|
||||
parsing_report
|
||||
|
||||
"""
|
||||
def __init__(self, cols, rows):
|
||||
|
|
@ -84,11 +134,7 @@ class Table(object):
|
|||
|
||||
@property
|
||||
def data(self):
|
||||
"""
|
||||
|
||||
Returns
|
||||
-------
|
||||
|
||||
"""Returns two-dimensional list of strings in table.
|
||||
"""
|
||||
d = []
|
||||
for row in self.cells:
|
||||
|
|
@ -97,11 +143,8 @@ class Table(object):
|
|||
|
||||
@property
|
||||
def parsing_report(self):
|
||||
"""
|
||||
|
||||
Returns
|
||||
-------
|
||||
|
||||
"""Returns a parsing report with accuracy, %whitespace,
|
||||
table number on page and page number.
|
||||
"""
|
||||
# pretty?
|
||||
report = {
|
||||
|
|
@ -112,27 +155,8 @@ class Table(object):
|
|||
}
|
||||
return report
|
||||
|
||||
def set_border(self):
|
||||
"""
|
||||
|
||||
Returns
|
||||
-------
|
||||
|
||||
"""
|
||||
for r in range(len(self.rows)):
|
||||
self.cells[r][0].left = True
|
||||
self.cells[r][len(self.cols) - 1].right = True
|
||||
for c in range(len(self.cols)):
|
||||
self.cells[0][c].top = True
|
||||
self.cells[len(self.rows) - 1][c].bottom = True
|
||||
return self
|
||||
|
||||
def set_all_edges(self):
|
||||
"""
|
||||
|
||||
Returns
|
||||
-------
|
||||
|
||||
"""Sets all table edges to True.
|
||||
"""
|
||||
for row in self.cells:
|
||||
for cell in row:
|
||||
|
|
@ -140,16 +164,16 @@ class Table(object):
|
|||
return self
|
||||
|
||||
def set_edges(self, vertical, horizontal, joint_close_tol=2):
|
||||
"""
|
||||
"""Sets a cell's edges to True depending on whether the cell's
|
||||
coordinates overlap with the line's coordinates within a
|
||||
tolerance.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
vertical
|
||||
horizontal
|
||||
joint_close_tol
|
||||
|
||||
Returns
|
||||
-------
|
||||
vertical : list
|
||||
List of detected vertical lines.
|
||||
horizontal : list
|
||||
List of detected horizontal lines.
|
||||
|
||||
"""
|
||||
for v in vertical:
|
||||
|
|
@ -256,12 +280,20 @@ class Table(object):
|
|||
|
||||
return self
|
||||
|
||||
def set_span(self):
|
||||
def set_border(self):
|
||||
"""Sets table border edges to True.
|
||||
"""
|
||||
for r in range(len(self.rows)):
|
||||
self.cells[r][0].left = True
|
||||
self.cells[r][len(self.cols) - 1].right = True
|
||||
for c in range(len(self.cols)):
|
||||
self.cells[0][c].top = True
|
||||
self.cells[len(self.rows) - 1][c].bottom = True
|
||||
return self
|
||||
|
||||
Returns
|
||||
-------
|
||||
|
||||
def set_span(self):
|
||||
"""Sets a cell's hspan or vspan attribute to True depending
|
||||
on whether the cell spans horizontally or vertically.
|
||||
"""
|
||||
for row in self.cells:
|
||||
for cell in row:
|
||||
|
|
@ -288,6 +320,8 @@ class Table(object):
|
|||
return self
|
||||
|
||||
def to_csv(self, path, **kwargs):
|
||||
"""Write Table to a comma-separated values (csv) file.
|
||||
"""
|
||||
kw = {
|
||||
'encoding': 'utf-8',
|
||||
'index': False,
|
||||
|
|
@ -297,6 +331,8 @@ class Table(object):
|
|||
self.df.to_csv(path, **kw)
|
||||
|
||||
def to_json(self, path, **kwargs):
|
||||
"""Write Table to a JSON file.
|
||||
"""
|
||||
kw = {
|
||||
'orient': 'records'
|
||||
}
|
||||
|
|
@ -306,6 +342,8 @@ class Table(object):
|
|||
f.write(json_string)
|
||||
|
||||
def to_excel(self, path, **kwargs):
|
||||
"""Write Table to an Excel file.
|
||||
"""
|
||||
kw = {
|
||||
'sheet_name': 'page-{}-table-{}'.format(self.page, self.order),
|
||||
'encoding': 'utf-8'
|
||||
|
|
@ -316,13 +354,21 @@ class Table(object):
|
|||
writer.save()
|
||||
|
||||
def to_html(self, path, **kwargs):
|
||||
"""Write Table to an HTML file.
|
||||
"""
|
||||
html_string = self.df.to_html(**kwargs)
|
||||
with open(path, 'w') as f:
|
||||
f.write(html_string)
|
||||
|
||||
|
||||
class TableList(object):
|
||||
"""
|
||||
"""Defines a list of camelot.core.Table objects. Each table can
|
||||
be accessed using its index.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
n : int
|
||||
Number of tables in the list.
|
||||
|
||||
"""
|
||||
def __init__(self, tables):
|
||||
|
|
@ -371,6 +417,18 @@ class TableList(object):
|
|||
z.write(filepath, os.path.basename(filepath))
|
||||
|
||||
def export(self, path, f='csv', compress=False):
|
||||
"""Exports the list of tables to specified file format.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path : str
|
||||
Filepath
|
||||
f : str
|
||||
File format. Can be csv, json, excel or html.
|
||||
compress : bool
|
||||
Whether or not to add files to a ZIP archive.
|
||||
|
||||
"""
|
||||
dirname = os.path.dirname(path)
|
||||
basename = os.path.basename(path)
|
||||
root, ext = os.path.splitext(basename)
|
||||
|
|
@ -402,9 +460,6 @@ class TableList(object):
|
|||
|
||||
|
||||
class Geometry(object):
|
||||
"""
|
||||
|
||||
"""
|
||||
def __init__(self):
|
||||
self.text = []
|
||||
self.images = ()
|
||||
|
|
@ -421,9 +476,6 @@ class Geometry(object):
|
|||
|
||||
|
||||
class GeometryList(object):
|
||||
"""
|
||||
|
||||
"""
|
||||
def __init__(self, geometry):
|
||||
self.text = [g.text for g in geometry]
|
||||
self.images = [g.images for g in geometry]
|
||||
|
|
|
|||
|
|
@ -9,18 +9,43 @@ from .utils import get_page_layout, get_text_objects, get_rotation
|
|||
|
||||
|
||||
class PDFHandler(object):
|
||||
"""
|
||||
"""Handles all operations like temp directory creation, splitting
|
||||
file into single page pdfs, parsing each pdf and then removing the
|
||||
temp directory.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
filename : str
|
||||
Path to pdf file.
|
||||
pages : str
|
||||
Comma-separated page numbers to parse.
|
||||
Example: 1,3,4 or 1,4-end
|
||||
|
||||
"""
|
||||
def __init__(self, filename, pages='1'):
|
||||
self.filename = filename
|
||||
if not self.filename.endswith('.pdf'):
|
||||
raise TypeError("File format not supported.")
|
||||
self.pages = self.__get_pages(self.filename, pages)
|
||||
self.pages = self._get_pages(self.filename, pages)
|
||||
self.tempdir = tempfile.mkdtemp()
|
||||
|
||||
def __get_pages(self, filename, pages):
|
||||
# refactor
|
||||
def _get_pages(self, filename, pages):
|
||||
"""Converts pages string to list of ints.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
filename : str
|
||||
Path to pdf file.
|
||||
pages : str
|
||||
Comma-separated page numbers to parse.
|
||||
Example: 1,3,4 or 1,4-end
|
||||
|
||||
Returns
|
||||
-------
|
||||
P : list
|
||||
List of int page numbers.
|
||||
|
||||
"""
|
||||
page_numbers = []
|
||||
if pages == '1':
|
||||
page_numbers.append({'start': 1, 'end': 1})
|
||||
|
|
@ -42,8 +67,19 @@ class PDFHandler(object):
|
|||
P.extend(range(p['start'], p['end'] + 1))
|
||||
return sorted(set(P))
|
||||
|
||||
def __save_page(self, filename, page, temp):
|
||||
# refactor
|
||||
def _save_page(self, filename, page, temp):
|
||||
"""Saves specified page from pdf into a temporary directory.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
filename : str
|
||||
Path to pdf file.
|
||||
page : int
|
||||
Page number
|
||||
temp : str
|
||||
Tmp directory
|
||||
|
||||
"""
|
||||
with open(filename, 'rb') as fileobj:
|
||||
infile = PdfFileReader(fileobj, strict=False)
|
||||
fpath = os.path.join(temp, 'page-{0}.pdf'.format(page))
|
||||
|
|
@ -65,28 +101,37 @@ class PDFHandler(object):
|
|||
infile = PdfFileReader(open(fpath_new, 'rb'), strict=False)
|
||||
outfile = PdfFileWriter()
|
||||
p = infile.getPage(0)
|
||||
if rotation == 'left':
|
||||
if rotation == 'anticlockwise':
|
||||
p.rotateClockwise(90)
|
||||
elif rotation == 'right':
|
||||
elif rotation == 'clockwise':
|
||||
p.rotateCounterClockwise(90)
|
||||
outfile.addPage(p)
|
||||
with open(fpath, 'wb') as f:
|
||||
outfile.write(f)
|
||||
|
||||
def parse(self, mesh=False, **kwargs):
|
||||
"""
|
||||
"""Extracts tables by calling parser.get_tables on all single
|
||||
page pdfs.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
mesh
|
||||
kwargs
|
||||
mesh : bool (default: False)
|
||||
Whether or not to use Lattice method of parsing. Stream
|
||||
is used by default.
|
||||
kwargs : dict
|
||||
See camelot.read_pdf kwargs.
|
||||
|
||||
Returns
|
||||
-------
|
||||
tables : camelot.core.TableList
|
||||
List of tables found in pdf.
|
||||
geometry : camelot.core.GeometryList
|
||||
List of geometry objects (contours, lines, joints)
|
||||
found in pdf.
|
||||
|
||||
"""
|
||||
for p in self.pages:
|
||||
self.__save_page(self.filename, p, self.tempdir)
|
||||
self._save_page(self.filename, p, self.tempdir)
|
||||
pages = [os.path.join(self.tempdir, 'page-{0}.pdf'.format(p))
|
||||
for p in self.pages]
|
||||
tables = []
|
||||
|
|
|
|||
|
|
@ -9,17 +9,31 @@ from .utils import merge_tuples
|
|||
|
||||
|
||||
def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
|
||||
"""
|
||||
"""Thresholds an image using OpenCV's adaptiveThreshold.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
imagename
|
||||
process_background
|
||||
blocksize
|
||||
c
|
||||
imagename : string
|
||||
Path to image file.
|
||||
process_background : bool, optional (default: False)
|
||||
Whether or not to process lines that are in background.
|
||||
blocksize : int, optional (default: 15)
|
||||
Size of a pixel neighborhood that is used to calculate a
|
||||
threshold value for the pixel: 3, 5, 7, and so on.
|
||||
|
||||
For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
|
||||
c : int, optional (default: -2)
|
||||
Constant subtracted from the mean or weighted mean.
|
||||
Normally, it is positive but may be zero or negative as well.
|
||||
|
||||
For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
|
||||
|
||||
Returns
|
||||
-------
|
||||
img : object
|
||||
numpy.ndarray representing the original image.
|
||||
threshold : object
|
||||
numpy.ndarray representing the thresholded image.
|
||||
|
||||
"""
|
||||
img = cv2.imread(imagename)
|
||||
|
|
@ -35,17 +49,35 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
|
|||
|
||||
|
||||
def find_lines(threshold, direction='horizontal', line_size_scaling=15, iterations=0):
|
||||
"""
|
||||
"""Finds horizontal and vertical lines by applying morphological
|
||||
transformations on an image.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
threshold
|
||||
direction
|
||||
line_size_scaling
|
||||
iterations
|
||||
threshold : object
|
||||
numpy.ndarray representing the thresholded image.
|
||||
direction : string, optional (default: 'horizontal')
|
||||
Specifies whether to find vertical or horizontal lines.
|
||||
line_size_scaling : int, optional (default: 15)
|
||||
Factor by which the page dimensions will be divided to get
|
||||
smallest length of lines that should be detected.
|
||||
|
||||
The larger this value, the smaller the detected lines. Making it
|
||||
too large will lead to text being detected as lines.
|
||||
iterations : int, optional (default: 0)
|
||||
Number of times erosion/dilation is applied.
|
||||
|
||||
For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
|
||||
|
||||
Returns
|
||||
-------
|
||||
dmask : object
|
||||
numpy.ndarray representing pixels where vertical/horizontal
|
||||
lines lie.
|
||||
lines : list
|
||||
List of tuples representing vertical/horizontal lines with
|
||||
coordinates relative to a left-top origin in
|
||||
image coordinate space.
|
||||
|
||||
"""
|
||||
lines = []
|
||||
|
|
@ -84,15 +116,21 @@ def find_lines(threshold, direction='horizontal', line_size_scaling=15, iteratio
|
|||
|
||||
|
||||
def find_table_contours(vertical, horizontal):
|
||||
"""
|
||||
"""Finds table boundaries using OpenCV's findContours.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
vertical
|
||||
horizontal
|
||||
vertical : object
|
||||
numpy.ndarray representing pixels where vertical lines lie.
|
||||
horizontal : object
|
||||
numpy.ndarray representing pixels where horizontal lines lie.
|
||||
|
||||
Returns
|
||||
-------
|
||||
cont : list
|
||||
List of tuples representing table boundaries. Each tuple is of
|
||||
the form (x, y, w, h) where (x, y) -> left-top, w -> width and
|
||||
h -> height in image coordinate space.
|
||||
|
||||
"""
|
||||
mask = vertical + horizontal
|
||||
|
|
@ -114,16 +152,26 @@ def find_table_contours(vertical, horizontal):
|
|||
|
||||
|
||||
def find_table_joints(contours, vertical, horizontal):
|
||||
"""
|
||||
"""Finds joints/intersections present inside each table boundary.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
contours
|
||||
vertical
|
||||
horizontal
|
||||
contours : list
|
||||
List of tuples representing table boundaries. Each tuple is of
|
||||
the form (x, y, w, h) where (x, y) -> left-top, w -> width and
|
||||
h -> height in image coordinate space.
|
||||
vertical : object
|
||||
numpy.ndarray representing pixels where vertical lines lie.
|
||||
horizontal : object
|
||||
numpy.ndarray representing pixels where horizontal lines lie.
|
||||
|
||||
Returns
|
||||
-------
|
||||
tables : dict
|
||||
Dict with table boundaries as keys and list of intersections
|
||||
in that boundary as their value.
|
||||
Keys are of the form (x1, y1, x2, y2) where (x1, y1) -> lb
|
||||
and (x2, y2) -> rt in image coordinate space.
|
||||
|
||||
"""
|
||||
joints = np.bitwise_and(vertical, horizontal)
|
||||
|
|
@ -150,15 +198,24 @@ def find_table_joints(contours, vertical, horizontal):
|
|||
|
||||
|
||||
def remove_lines(threshold, line_size_scaling=15):
|
||||
"""
|
||||
"""Removes lines from a thresholded image.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
threshold
|
||||
line_size_scaling
|
||||
threshold : object
|
||||
numpy.ndarray representing the thresholded image.
|
||||
line_size_scaling : int, optional (default: 15)
|
||||
Factor by which the page dimensions will be divided to get
|
||||
smallest length of lines that should be detected.
|
||||
|
||||
The larger this value, the smaller the detected lines. Making it
|
||||
too large will lead to text being detected as lines.
|
||||
|
||||
Returns
|
||||
-------
|
||||
threshold : object
|
||||
numpy.ndarray representing the thresholded image
|
||||
with horizontal and vertical lines removed.
|
||||
|
||||
"""
|
||||
size = threshold.shape[0] // line_size_scaling
|
||||
|
|
@ -178,16 +235,23 @@ def remove_lines(threshold, line_size_scaling=15):
|
|||
|
||||
|
||||
def find_cuts(threshold, char_size_scaling=200):
|
||||
"""
|
||||
"""Finds cuts made by text projections on y-axis.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
threshold
|
||||
char_size_scaling
|
||||
threshold : object
|
||||
numpy.ndarray representing the thresholded image.
|
||||
char_size_scaling : int, optional (default: 200)
|
||||
Factor by which the page dimensions will be divided to get
|
||||
smallest length of lines that should be detected.
|
||||
|
||||
The larger this value, the smaller the detected lines. Making it
|
||||
too large will lead to text being detected as lines.
|
||||
|
||||
Returns
|
||||
-------
|
||||
|
||||
y_cuts : list
|
||||
List of cuts on y-axis.
|
||||
"""
|
||||
size = threshold.shape[0] // char_size_scaling
|
||||
char_el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size))
|
||||
|
|
|
|||
|
|
@ -2,20 +2,93 @@ from .handlers import PDFHandler
|
|||
|
||||
|
||||
def read_pdf(filepath, pages='1', mesh=False, **kwargs):
|
||||
"""
|
||||
"""Read PDF and return parsed data tables.
|
||||
|
||||
Note: kwargs annotated with ^ can only be used with mesh=False
|
||||
and kwargs annotated with * can only be used with mesh=True.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
filepath
|
||||
pages
|
||||
mesh
|
||||
kwargs
|
||||
filepath : str
|
||||
Path to pdf file.
|
||||
pages : str
|
||||
Comma-separated page numbers to parse.
|
||||
Example: 1,3,4 or 1,4-end
|
||||
mesh : bool (default: False)
|
||||
Whether or not to use Lattice method of parsing. Stream
|
||||
is used by default.
|
||||
table_area : list, optional (default: None)
|
||||
List of table areas to analyze as strings of the form
|
||||
x1,y1,x2,y2 where (x1, y1) -> left-top and
|
||||
(x2, y2) -> right-bottom in pdf coordinate space.
|
||||
columns^ : list, optional (default: None)
|
||||
List of column x-coordinates as strings where the coordinates
|
||||
are comma-separated.
|
||||
split_text : bool, optional (default: False)
|
||||
Whether or not to split a text line if it spans across
|
||||
multiple cells.
|
||||
flag_size : bool, optional (default: False)
|
||||
Whether or not to highlight a substring using <s></s>
|
||||
if its size is different from rest of the string, useful for
|
||||
super and subscripts.
|
||||
row_close_tol^ : int, optional (default: 2)
|
||||
Rows will be formed by combining text vertically
|
||||
within this tolerance.
|
||||
col_close_tol^ : int, optional (default: 0)
|
||||
Columns will be formed by combining text horizontally
|
||||
within this tolerance.
|
||||
process_background* : bool, optional (default: False)
|
||||
Whether or not to process lines that are in background.
|
||||
line_size_scaling* : int, optional (default: 15)
|
||||
Factor by which the page dimensions will be divided to get
|
||||
smallest length of lines that should be detected.
|
||||
|
||||
The larger this value, the smaller the detected lines. Making it
|
||||
too large will lead to text being detected as lines.
|
||||
copy_text* : list, optional (default: None)
|
||||
{'h', 'v'}
|
||||
Select one or more strings from above and pass them as a list
|
||||
to specify the direction in which text should be copied over
|
||||
when a cell spans multiple rows or columns.
|
||||
shift_text* : list, optional (default: ['l', 't'])
|
||||
{'l', 'r', 't', 'b'}
|
||||
Select one or more strings from above and pass them as a list
|
||||
to specify where the text in a spanning cell should flow.
|
||||
line_close_tol* : int, optional (default: 2)
|
||||
Tolerance parameter used to merge vertical and horizontal
|
||||
detected lines which lie close to each other.
|
||||
joint_close_tol* : int, optional (default: 2)
|
||||
Tolerance parameter used to decide whether the detected lines
|
||||
and points lie close to each other.
|
||||
threshold_blocksize : int, optional (default: 15)
|
||||
Size of a pixel neighborhood that is used to calculate a
|
||||
threshold value for the pixel: 3, 5, 7, and so on.
|
||||
|
||||
For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
|
||||
threshold_constant : int, optional (default: -2)
|
||||
Constant subtracted from the mean or weighted mean.
|
||||
Normally, it is positive but may be zero or negative as well.
|
||||
|
||||
For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
|
||||
iterations : int, optional (default: 0)
|
||||
Number of times erosion/dilation is applied.
|
||||
|
||||
For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
|
||||
margins : tuple
|
||||
PDFMiner margins. (char_margin, line_margin, word_margin)
|
||||
|
||||
For more information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
|
||||
debug : bool, optional (default: False)
|
||||
Whether or not to return all text objects on the page
|
||||
which can be used to generate a matplotlib plot, to get
|
||||
values for table_area(s) and debugging.
|
||||
|
||||
Returns
|
||||
-------
|
||||
tables : camelot.core.TableList
|
||||
|
||||
"""
|
||||
# explicit type conversion
|
||||
# validate kwargs?
|
||||
p = PDFHandler(filepath, pages)
|
||||
tables, __ = p.parse(mesh=mesh, **kwargs)
|
||||
return tables
|
||||
|
|
@ -5,8 +5,7 @@ from ..utils import get_page_layout, get_text_objects
|
|||
|
||||
|
||||
class BaseParser(object):
|
||||
"""
|
||||
|
||||
"""Defines a base parser.
|
||||
"""
|
||||
def _generate_layout(self, filename):
|
||||
self.filename = filename
|
||||
|
|
|
|||
|
|
@ -11,7 +11,7 @@ from .base import BaseParser
|
|||
from ..core import Table
|
||||
from ..utils import (scale_image, scale_pdf, segments_in_bbox, text_in_bbox,
|
||||
merge_close_lines, get_table_index, compute_accuracy,
|
||||
count_empty_strings, encode_, setup_logging)
|
||||
compute_whitespace, setup_logging, encode_)
|
||||
from ..image_processing import (adaptive_threshold, find_lines,
|
||||
find_table_contours, find_table_joints)
|
||||
|
||||
|
|
@ -20,14 +20,74 @@ logger = setup_logging(__name__)
|
|||
|
||||
|
||||
class Lattice(BaseParser):
|
||||
"""
|
||||
"""Lattice method of parsing looks for lines between text
|
||||
to form a table.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
table_area : list, optional (default: None)
|
||||
List of table areas to analyze as strings of the form
|
||||
x1,y1,x2,y2 where (x1, y1) -> left-top and
|
||||
(x2, y2) -> right-bottom in pdf coordinate space.
|
||||
process_background : bool, optional (default: False)
|
||||
Whether or not to process lines that are in background.
|
||||
line_size_scaling : int, optional (default: 15)
|
||||
Factor by which the page dimensions will be divided to get
|
||||
smallest length of lines that should be detected.
|
||||
|
||||
The larger this value, the smaller the detected lines. Making it
|
||||
too large will lead to text being detected as lines.
|
||||
copy_text : list, optional (default: None)
|
||||
{'h', 'v'}
|
||||
Select one or more strings from above and pass them as a list
|
||||
to specify the direction in which text should be copied over
|
||||
when a cell spans multiple rows or columns.
|
||||
shift_text : list, optional (default: ['l', 't'])
|
||||
{'l', 'r', 't', 'b'}
|
||||
Select one or more strings from above and pass them as a list
|
||||
to specify where the text in a spanning cell should flow.
|
||||
split_text : bool, optional (default: False)
|
||||
Whether or not to split a text line if it spans across
|
||||
multiple cells.
|
||||
flag_size : bool, optional (default: False)
|
||||
Whether or not to highlight a substring using <s></s>
|
||||
if its size is different from rest of the string, useful for
|
||||
super and subscripts.
|
||||
line_close_tol : int, optional (default: 2)
|
||||
Tolerance parameter used to merge vertical and horizontal
|
||||
detected lines which lie close to each other.
|
||||
joint_close_tol : int, optional (default: 2)
|
||||
Tolerance parameter used to decide whether the detected lines
|
||||
and points lie close to each other.
|
||||
threshold_blocksize : int, optional (default: 15)
|
||||
Size of a pixel neighborhood that is used to calculate a
|
||||
threshold value for the pixel: 3, 5, 7, and so on.
|
||||
|
||||
For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
|
||||
threshold_constant : int, optional (default: -2)
|
||||
Constant subtracted from the mean or weighted mean.
|
||||
Normally, it is positive but may be zero or negative as well.
|
||||
|
||||
For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
|
||||
iterations : int, optional (default: 0)
|
||||
Number of times erosion/dilation is applied.
|
||||
|
||||
For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
|
||||
margins : tuple
|
||||
PDFMiner margins. (char_margin, line_margin, word_margin)
|
||||
|
||||
For more information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
|
||||
debug : bool, optional (default: False)
|
||||
Whether or not to return all text objects on the page
|
||||
which can be used to generate a matplotlib plot, to get
|
||||
values for table_area(s) and debugging.
|
||||
|
||||
"""
|
||||
def __init__(self, table_area=None, process_background=False,
|
||||
line_size_scaling=15, copy_text=None, shift_text=['l', 't'],
|
||||
split_text=False, flag_size=False, line_close_tol=2,
|
||||
joint_close_tol=2, threshold_blocksize=15, threshold_constant=-2,
|
||||
iterations=0, margins=(1.0, 0.5, 0.1), debug=None):
|
||||
iterations=0, margins=(1.0, 0.5, 0.1), debug=False):
|
||||
self.table_area = table_area
|
||||
self.process_background = process_background
|
||||
self.line_size_scaling = line_size_scaling
|
||||
|
|
@ -45,6 +105,27 @@ class Lattice(BaseParser):
|
|||
|
||||
@staticmethod
|
||||
def _reduce_index(t, idx, shift_text):
|
||||
"""Reduces index of a text object if it lies within a spanning
|
||||
cell.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
t : camelot.core.Table
|
||||
idx : list
|
||||
List of tuples of the form (r_idx, c_idx, text).
|
||||
shift_text : list
|
||||
{'l', 'r', 't', 'b'}
|
||||
Select one or more strings from above and pass them as a
|
||||
list to specify where the text in a spanning cell should
|
||||
flow.
|
||||
|
||||
Returns
|
||||
-------
|
||||
indices : list
|
||||
List of tuples of the form (r_idx, c_idx, text) where
|
||||
r_idx and c_idx are new row and column indices for text.
|
||||
|
||||
"""
|
||||
indices = []
|
||||
for r_idx, c_idx, text in idx:
|
||||
for d in shift_text:
|
||||
|
|
@ -69,6 +150,22 @@ class Lattice(BaseParser):
|
|||
|
||||
@staticmethod
|
||||
def _copy_spanning_text(t, copy_text=None):
|
||||
"""Copies over text in empty spanning cells.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
t : camelot.core.Table
|
||||
copy_text : list, optional (default: None)
|
||||
{'h', 'v'}
|
||||
Select one or more strings from above and pass them as a list
|
||||
to specify the direction in which text should be copied over
|
||||
when a cell spans multiple rows or columns.
|
||||
|
||||
Returns
|
||||
-------
|
||||
t : camelot.core.Table
|
||||
|
||||
"""
|
||||
for f in copy_text:
|
||||
if f == "h":
|
||||
for i in range(len(t.cells)):
|
||||
|
|
@ -199,7 +296,7 @@ class Lattice(BaseParser):
|
|||
table.df = pd.DataFrame(data)
|
||||
table.shape = table.df.shape
|
||||
|
||||
whitespace, __, __ = count_empty_strings(data)
|
||||
whitespace = compute_whitespace(data)
|
||||
table.accuracy = accuracy
|
||||
table.whitespace = whitespace
|
||||
table.order = table_idx + 1
|
||||
|
|
@ -208,16 +305,6 @@ class Lattice(BaseParser):
|
|||
return table
|
||||
|
||||
def extract_tables(self, filename):
|
||||
"""
|
||||
|
||||
Parameters
|
||||
----------
|
||||
filename
|
||||
|
||||
Returns
|
||||
-------
|
||||
|
||||
"""
|
||||
logger.info('Processing {}'.format(os.path.basename(filename)))
|
||||
self._generate_layout(filename)
|
||||
|
||||
|
|
@ -237,7 +324,7 @@ class Lattice(BaseParser):
|
|||
table = self._generate_table(table_idx, cols, rows, v_s=v_s, h_s=h_s)
|
||||
_tables.append(table)
|
||||
|
||||
if self.debug is not None:
|
||||
if self.debug:
|
||||
text = []
|
||||
text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
|
||||
text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
|
||||
|
|
|
|||
|
|
@ -8,19 +8,54 @@ import pandas as pd
|
|||
from .base import BaseParser
|
||||
from ..core import Table
|
||||
from ..utils import (text_in_bbox, get_table_index, compute_accuracy,
|
||||
count_empty_strings, encode_, setup_logging)
|
||||
compute_whitespace, setup_logging, encode_)
|
||||
|
||||
|
||||
logger = setup_logging(__name__)
|
||||
|
||||
|
||||
class Stream(BaseParser):
|
||||
"""
|
||||
"""Stream method of parsing looks for spaces between text
|
||||
to form a table.
|
||||
|
||||
If you want to specify columns when specifying multiple table
|
||||
areas, make sure that the lengths of both lists are equal.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
table_area : list, optional (default: None)
|
||||
List of table areas to analyze as strings of the form
|
||||
x1,y1,x2,y2 where (x1, y1) -> left-top and
|
||||
(x2, y2) -> right-bottom in pdf coordinate space.
|
||||
columns : list, optional (default: None)
|
||||
List of column x-coordinates as strings where the coordinates
|
||||
are comma-separated.
|
||||
split_text : bool, optional (default: False)
|
||||
Whether or not to split a text line if it spans across
|
||||
multiple cells.
|
||||
flag_size : bool, optional (default: False)
|
||||
Whether or not to highlight a substring using <s></s>
|
||||
if its size is different from rest of the string, useful for
|
||||
super and subscripts.
|
||||
row_close_tol : int, optional (default: 2)
|
||||
Rows will be formed by combining text vertically
|
||||
within this tolerance.
|
||||
col_close_tol : int, optional (default: 0)
|
||||
Columns will be formed by combining text horizontally
|
||||
within this tolerance.
|
||||
margins : tuple, optional (default: (1.0, 0.5, 0.1))
|
||||
PDFMiner margins. (char_margin, line_margin, word_margin)
|
||||
|
||||
For more information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
|
||||
debug : bool, optional (default: False)
|
||||
Whether or not to return all text objects on the page
|
||||
which can be used to generate a matplotlib plot, to get
|
||||
values for table_area(s), columns and debugging.
|
||||
|
||||
"""
|
||||
def __init__(self, table_area=None, columns=None, split_text=False,
|
||||
flag_size=False, row_close_tol=2, col_close_tol=0,
|
||||
margins=(1.0, 0.5, 0.1), debug=None):
|
||||
margins=(1.0, 0.5, 0.1), debug=False):
|
||||
self.table_area = table_area
|
||||
self.columns = columns
|
||||
self._validate_columns()
|
||||
|
|
@ -33,6 +68,20 @@ class Stream(BaseParser):
|
|||
|
||||
@staticmethod
|
||||
def _text_bbox(t_bbox):
|
||||
"""Returns bounding box for the text present on a page.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
t_bbox : dict
|
||||
Dict with two keys 'horizontal' and 'vertical' with lists of
|
||||
LTTextLineHorizontals and LTTextLineVerticals respectively.
|
||||
|
||||
Returns
|
||||
-------
|
||||
text_bbox : tuple
|
||||
Tuple (x0, y0, x1, y1) in pdf coordinate space.
|
||||
|
||||
"""
|
||||
xmin = min([t.x0 for direction in t_bbox for t in t_bbox[direction]])
|
||||
ymin = min([t.y0 for direction in t_bbox for t in t_bbox[direction]])
|
||||
xmax = max([t.x1 for direction in t_bbox for t in t_bbox[direction]])
|
||||
|
|
@ -42,6 +91,21 @@ class Stream(BaseParser):
|
|||
|
||||
@staticmethod
|
||||
def _group_rows(text, row_close_tol=2):
|
||||
"""Groups PDFMiner text objects into rows vertically
|
||||
within a tolerance.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
text : list
|
||||
List of PDFMiner text objects.
|
||||
row_close_tol : int, optional (default: 2)
|
||||
|
||||
Returns
|
||||
-------
|
||||
rows : list
|
||||
Two-dimensional list of text objects grouped into rows.
|
||||
|
||||
"""
|
||||
row_y = 0
|
||||
rows = []
|
||||
temp = []
|
||||
|
|
@ -61,6 +125,21 @@ class Stream(BaseParser):
|
|||
|
||||
@staticmethod
|
||||
def _merge_columns(l, col_close_tol=0):
|
||||
"""Merges column boundaries horizontally if they overlap
|
||||
or lie within a tolerance.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
l : list
|
||||
List of column x-coordinate tuples.
|
||||
col_close_tol : int, optional (default: 0)
|
||||
|
||||
Returns
|
||||
-------
|
||||
merged : list
|
||||
List of merged column x-coordinate tuples.
|
||||
|
||||
"""
|
||||
merged = []
|
||||
for higher in l:
|
||||
if not merged:
|
||||
|
|
@ -89,6 +168,21 @@ class Stream(BaseParser):
|
|||
|
||||
@staticmethod
|
||||
def _join_rows(rows_grouped, text_y_max, text_y_min):
|
||||
"""Makes row coordinates continuous.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
rows_grouped : list
|
||||
Two-dimensional list of text objects grouped into rows.
|
||||
text_y_max : int
|
||||
text_y_min : int
|
||||
|
||||
Returns
|
||||
-------
|
||||
rows : list
|
||||
List of continuous row y-coordinate tuples.
|
||||
|
||||
"""
|
||||
row_mids = [sum([(t.y0 + t.y1) / 2 for t in r]) / len(r)
|
||||
if len(r) > 0 else 0 for r in rows_grouped]
|
||||
rows = [(row_mids[i] + row_mids[i - 1]) / 2 for i in range(1, len(row_mids))]
|
||||
|
|
@ -100,6 +194,23 @@ class Stream(BaseParser):
|
|||
|
||||
@staticmethod
|
||||
def _add_columns(cols, text, row_close_tol):
|
||||
"""Adds columns to existing list by taking into account
|
||||
the text that lies outside the current column x-coordinates.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
cols : list
|
||||
List of column x-coordinate tuples.
|
||||
text : list
|
||||
List of PDFMiner text objects.
|
||||
row_close_tol : int
|
||||
|
||||
Returns
|
||||
-------
|
||||
cols : list
|
||||
Updated list of column x-coordinate tuples.
|
||||
|
||||
"""
|
||||
if text:
|
||||
text = Stream._group_rows(text, row_close_tol=row_close_tol)
|
||||
elements = [len(r) for r in text]
|
||||
|
|
@ -110,6 +221,21 @@ class Stream(BaseParser):
|
|||
|
||||
@staticmethod
|
||||
def _join_columns(cols, text_x_min, text_x_max):
|
||||
"""Makes column coordinates continuous.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
cols : list
|
||||
List of column x-coordinate tuples.
|
||||
text_x_min : int
|
||||
text_x_max : int
|
||||
|
||||
Returns
|
||||
-------
|
||||
cols : list
|
||||
Updated list of column x-coordinate tuples.
|
||||
|
||||
"""
|
||||
cols = sorted(cols)
|
||||
cols = [(cols[i][0] + cols[i - 1][1]) / 2 for i in range(1, len(cols))]
|
||||
cols.insert(0, text_x_min)
|
||||
|
|
@ -207,7 +333,7 @@ class Stream(BaseParser):
|
|||
table.df = pd.DataFrame(data)
|
||||
table.shape = table.df.shape
|
||||
|
||||
whitespace, __, __ = count_empty_strings(data)
|
||||
whitespace = compute_whitespace(data)
|
||||
table.accuracy = accuracy
|
||||
table.whitespace = whitespace
|
||||
table.order = table_idx + 1
|
||||
|
|
@ -216,16 +342,6 @@ class Stream(BaseParser):
|
|||
return table
|
||||
|
||||
def extract_tables(self, filename):
|
||||
"""
|
||||
|
||||
Parameters
|
||||
----------
|
||||
filename
|
||||
|
||||
Returns
|
||||
-------
|
||||
|
||||
"""
|
||||
logger.info('Processing {}'.format(os.path.basename(filename)))
|
||||
self._generate_layout(filename)
|
||||
|
||||
|
|
@ -244,7 +360,7 @@ class Stream(BaseParser):
|
|||
table = self._generate_table(table_idx, cols, rows)
|
||||
_tables.append(table)
|
||||
|
||||
if self.debug is not None:
|
||||
if self.debug:
|
||||
text = []
|
||||
text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
|
||||
text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
|
||||
|
|
|
|||
|
|
@ -6,19 +6,101 @@ from .handlers import PDFHandler
|
|||
|
||||
|
||||
def plot_geometry(filepath, pages='1', mesh=False, geometry_type='text', **kwargs):
|
||||
"""
|
||||
"""Plot geometry found on pdf page based on type specified,
|
||||
useful for debugging and playing with different parameters to get
|
||||
the best output.
|
||||
|
||||
Note: kwargs annotated with ^ can only be used with mesh=False
|
||||
and kwargs annotated with * can only be used with mesh=True.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
filepath
|
||||
pages
|
||||
mesh
|
||||
geometry_type
|
||||
kwargs
|
||||
filepath : str
|
||||
Path to pdf file.
|
||||
pages : str
|
||||
Comma-separated page numbers to parse.
|
||||
Example: 1,3,4 or 1,4-end
|
||||
mesh : bool (default: False)
|
||||
Whether or not to use Lattice method of parsing. Stream
|
||||
is used by default.
|
||||
geometry_type : str, optional (default: 'text')
|
||||
'text' : Plot text objects found on page, useful to get
|
||||
table_area and columns coordinates.
|
||||
'table' : Plot parsed table.
|
||||
'contour'* : Plot detected rectangles.
|
||||
'joint'* : Plot detected line intersections.
|
||||
'line'* : Plot detected lines.
|
||||
table_area : list, optional (default: None)
|
||||
List of table areas to analyze as strings of the form
|
||||
x1,y1,x2,y2 where (x1, y1) -> left-top and
|
||||
(x2, y2) -> right-bottom in pdf coordinate space.
|
||||
columns^ : list, optional (default: None)
|
||||
List of column x-coordinates as strings where the coordinates
|
||||
are comma-separated.
|
||||
split_text : bool, optional (default: False)
|
||||
Whether or not to split a text line if it spans across
|
||||
multiple cells.
|
||||
flag_size : bool, optional (default: False)
|
||||
Whether or not to highlight a substring using <s></s>
|
||||
if its size is different from rest of the string, useful for
|
||||
super and subscripts.
|
||||
row_close_tol^ : int, optional (default: 2)
|
||||
Rows will be formed by combining text vertically
|
||||
within this tolerance.
|
||||
col_close_tol^ : int, optional (default: 0)
|
||||
Columns will be formed by combining text horizontally
|
||||
within this tolerance.
|
||||
process_background* : bool, optional (default: False)
|
||||
Whether or not to process lines that are in background.
|
||||
line_size_scaling* : int, optional (default: 15)
|
||||
Factor by which the page dimensions will be divided to get
|
||||
smallest length of lines that should be detected.
|
||||
|
||||
The larger this value, smaller the detected lines. Making it
|
||||
too large will lead to text being detected as lines.
|
||||
copy_text* : list, optional (default: None)
|
||||
{'h', 'v'}
|
||||
Select one or more strings from above and pass them as a list
|
||||
to specify the direction in which text should be copied over
|
||||
when a cell spans multiple rows or columns.
|
||||
shift_text* : list, optional (default: ['l', 't'])
|
||||
{'l', 'r', 't', 'b'}
|
||||
Select one or more strings from above and pass them as a list
|
||||
to specify where the text in a spanning cell should flow.
|
||||
line_close_tol* : int, optional (default: 2)
|
||||
Tolerance parameter used to merge vertical and horizontal
|
||||
detected lines which lie close to each other.
|
||||
joint_close_tol* : int, optional (default: 2)
|
||||
Tolerance parameter used to decide whether the detected lines
|
||||
and points lie close to each other.
|
||||
threshold_blocksize* : int, optional (default: 15)
|
||||
Size of a pixel neighborhood that is used to calculate a
|
||||
threshold value for the pixel: 3, 5, 7, and so on.
|
||||
|
||||
For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
|
||||
threshold_constant* : int, optional (default: -2)
|
||||
Constant subtracted from the mean or weighted mean.
|
||||
Normally, it is positive but may be zero or negative as well.
|
||||
|
||||
For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
|
||||
iterations* : int, optional (default: 0)
|
||||
Number of times erosion/dilation is applied.
|
||||
|
||||
For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
|
||||
margins : tuple
|
||||
PDFMiner margins. (char_margin, line_margin, word_margin)
|
||||
|
||||
For more information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
|
||||
debug : bool, optional (default: False)
|
||||
Whether or not to return all text objects on the page
|
||||
which can be used to generate a matplotlib plot, to get
|
||||
values for table_area(s) and debugging.
|
||||
|
||||
"""
|
||||
# explicit type conversion
|
||||
# validate kwargs?
|
||||
p = PDFHandler(filepath, pages)
|
||||
kwargs.update({'debug': geometry_type})
|
||||
debug = True if geometry_type else False
|
||||
kwargs.update({'debug': debug})
|
||||
__, geometry = p.parse(mesh=mesh, **kwargs)
|
||||
|
||||
if geometry_type == 'text':
|
||||
|
|
|
|||
356
camelot/utils.py
356
camelot/utils.py
|
|
@ -19,14 +19,15 @@ from pdfminer.layout import (LAParams, LTAnno, LTChar, LTTextLineHorizontal,
|
|||
|
||||
|
||||
def setup_logging(name):
|
||||
"""
|
||||
"""Sets up a logger with StreamHandler.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
name
|
||||
name : str
|
||||
|
||||
Returns
|
||||
-------
|
||||
logger : logging.Logger
|
||||
|
||||
"""
|
||||
logger = logging.getLogger(name)
|
||||
|
|
@ -47,15 +48,16 @@ logger = setup_logging(__name__)
|
|||
|
||||
|
||||
def translate(x1, x2):
|
||||
"""
|
||||
"""Translates x2 by x1.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
x1
|
||||
x2
|
||||
x1 : float
|
||||
x2 : float
|
||||
|
||||
Returns
|
||||
-------
|
||||
x2 : float
|
||||
|
||||
"""
|
||||
x2 += x1
|
||||
|
|
@ -63,15 +65,16 @@ def translate(x1, x2):
|
|||
|
||||
|
||||
def scale(x, s):
|
||||
"""
|
||||
"""Scales x by scaling factor s.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
x
|
||||
s
|
||||
x : float
|
||||
s : float
|
||||
|
||||
Returns
|
||||
-------
|
||||
x : float
|
||||
|
||||
"""
|
||||
x *= s
|
||||
|
|
@ -79,18 +82,21 @@ def scale(x, s):
|
|||
|
||||
|
||||
def rotate(x1, y1, x2, y2, angle):
|
||||
"""
|
||||
"""Rotates point x2, y2 about point x1, y1 by angle.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
x1
|
||||
y1
|
||||
x2
|
||||
y2
|
||||
angle
|
||||
x1 : float
|
||||
y1 : float
|
||||
x2 : float
|
||||
y2 : float
|
||||
angle : float
|
||||
Angle in radians.
|
||||
|
||||
Returns
|
||||
-------
|
||||
xnew : float
|
||||
ynew : float
|
||||
|
||||
"""
|
||||
s = np.sin(angle)
|
||||
|
|
@ -105,15 +111,26 @@ def rotate(x1, y1, x2, y2, angle):
|
|||
|
||||
|
||||
def scale_pdf(k, factors):
|
||||
"""
|
||||
"""Translates and scales pdf coordinate space to image
|
||||
coordinate space.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
k
|
||||
factors
|
||||
k : tuple
|
||||
Tuple (x1, y1, x2, y2) representing table bounding box where
|
||||
(x1, y1) -> lt and (x2, y2) -> rb in PDFMiner coordinate
|
||||
space.
|
||||
factors : tuple
|
||||
Tuple (scaling_factor_x, scaling_factor_y, pdf_y) where the
|
||||
first two elements are scaling factors and pdf_y is height of
|
||||
pdf.
|
||||
|
||||
Returns
|
||||
-------
|
||||
knew : tuple
|
||||
Tuple (x1, y1, x2, y2) representing table bounding box where
|
||||
(x1, y1) -> lt and (x2, y2) -> rb in OpenCV coordinate
|
||||
space.
|
||||
|
||||
"""
|
||||
x1, y1, x2, y2 = k
|
||||
|
|
@ -127,17 +144,28 @@ def scale_pdf(k, factors):
|
|||
|
||||
|
||||
def scale_image(tables, v_segments, h_segments, factors):
|
||||
"""
|
||||
"""Translates and scales image coordinate space to pdf
|
||||
coordinate space.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
tables
|
||||
v_segments
|
||||
h_segments
|
||||
factors
|
||||
tables : dict
|
||||
Dict with table boundaries as keys and list of intersections
|
||||
in that boundary as value.
|
||||
v_segments : list
|
||||
List of vertical line segments.
|
||||
h_segments : list
|
||||
List of horizontal line segments.
|
||||
factors : tuple
|
||||
Tuple (scaling_factor_x, scaling_factor_y, img_y) where the
|
||||
first two elements are scaling factors and img_y is height of
|
||||
image.
|
||||
|
||||
Returns
|
||||
-------
|
||||
tables_new : dict
|
||||
v_segments_new : dict
|
||||
h_segments_new : dict
|
||||
|
||||
"""
|
||||
scaling_factor_x, scaling_factor_y, img_y = factors
|
||||
|
|
@ -172,16 +200,23 @@ def scale_image(tables, v_segments, h_segments, factors):
|
|||
|
||||
|
||||
def get_rotation(lttextlh, lttextlv, ltchar):
|
||||
"""
|
||||
"""Detects if text in table is rotated or not using the current
|
||||
transformation matrix (CTM) and returns its orientation.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
lttextlh
|
||||
lttextlv
|
||||
ltchar
|
||||
lttextlh : list
|
||||
List of PDFMiner LTTextLineHorizontal objects.
|
||||
lttextlv : list
|
||||
List of PDFMiner LTTextLineVertical objects.
|
||||
ltchar : list
|
||||
List of PDFMiner LTChar objects.
|
||||
|
||||
Returns
|
||||
-------
|
||||
rotation : string
|
||||
'' if text in table is upright, otherwise 'clockwise' or
|
||||
'anticlockwise' depending on the direction of the 90 degree rotation.
|
||||
|
||||
"""
|
||||
rotation = ''
|
||||
|
|
@ -190,21 +225,30 @@ def get_rotation(lttextlh, lttextlv, ltchar):
|
|||
if hlen < vlen:
|
||||
clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in ltchar)
|
||||
anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in ltchar)
|
||||
rotation = 'left' if clockwise < anticlockwise else 'right'
|
||||
rotation = 'clockwise' if clockwise < anticlockwise else 'anticlockwise'
|
||||
return rotation
|
||||
|
||||
|
||||
def segments_in_bbox(bbox, v_segments, h_segments):
|
||||
"""
|
||||
"""Returns all line segments present inside a bounding box.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
bbox
|
||||
v_segments
|
||||
h_segments
|
||||
bbox : tuple
|
||||
Tuple (x1, y1, x2, y2) representing a bounding box where
|
||||
(x1, y1) -> lb and (x2, y2) -> rt in PDFMiner coordinate
|
||||
space.
|
||||
v_segments : list
|
||||
List of vertical line segments.
|
||||
h_segments : list
|
||||
List of horizontal line segments.
|
||||
|
||||
Returns
|
||||
-------
|
||||
v_s : list
|
||||
List of vertical line segments that lie inside table.
|
||||
h_s : list
|
||||
List of horizontal line segments that lie inside table.
|
||||
|
||||
"""
|
||||
lb = (bbox[0], bbox[1])
|
||||
|
|
@ -217,15 +261,20 @@ def segments_in_bbox(bbox, v_segments, h_segments):
|
|||
|
||||
|
||||
def text_in_bbox(bbox, text):
|
||||
"""
|
||||
"""Returns all text objects present inside a bounding box.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
bbox
|
||||
text
|
||||
bbox : tuple
|
||||
Tuple (x1, y1, x2, y2) representing a bounding box where
|
||||
(x1, y1) -> lb and (x2, y2) -> rt in PDFMiner coordinate
|
||||
space.
|
||||
text : list
|
||||
List of PDFMiner text objects.
|
||||
|
||||
Returns
|
||||
-------
|
||||
t_bbox : list
|
||||
List of PDFMiner text objects that lie inside table.
|
||||
|
||||
"""
|
||||
lb = (bbox[0], bbox[1])
|
||||
|
|
@ -237,15 +286,17 @@ def text_in_bbox(bbox, text):
|
|||
|
||||
|
||||
def remove_close_lines(ar, line_close_tol=2):
|
||||
"""
|
||||
"""Removes lines which are within a tolerance, based on their x or
|
||||
y axis projections.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
ar
|
||||
line_close_tol
|
||||
ar : list
|
||||
line_close_tol : int, optional (default: 2)
|
||||
|
||||
Returns
|
||||
-------
|
||||
ret : list
|
||||
|
||||
"""
|
||||
ret = []
|
||||
|
|
@ -262,15 +313,17 @@ def remove_close_lines(ar, line_close_tol=2):
|
|||
|
||||
|
||||
def merge_close_lines(ar, line_close_tol=2):
|
||||
"""
|
||||
"""Merges lines which are within a tolerance by calculating a
|
||||
moving mean, based on their x or y axis projections.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
ar
|
||||
line_close_tol
|
||||
ar : list
|
||||
line_close_tol : int, optional (default: 2)
|
||||
|
||||
Returns
|
||||
-------
|
||||
ret : list
|
||||
|
||||
"""
|
||||
ret = []
|
||||
|
|
@ -288,15 +341,19 @@ def merge_close_lines(ar, line_close_tol=2):
|
|||
|
||||
|
||||
def flag_font_size(textline, direction):
|
||||
"""
|
||||
"""Flags super/subscripts in text by enclosing them with <s></s>.
|
||||
May give false positives.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
textline
|
||||
direction
|
||||
textline : list
|
||||
List of PDFMiner LTChar objects.
|
||||
direction : string
|
||||
Direction of the PDFMiner LTTextLine object.
|
||||
|
||||
Returns
|
||||
-------
|
||||
fstring : string
|
||||
|
||||
"""
|
||||
if direction == 'horizontal':
|
||||
|
|
@ -324,18 +381,27 @@ def flag_font_size(textline, direction):
|
|||
return fstring
|
||||
|
||||
|
||||
def split_textline(table, textline, direction, flag_size=True):
|
||||
"""
|
||||
def split_textline(table, textline, direction, flag_size=False):
|
||||
"""Splits PDFMiner LTTextLine into substrings if it spans across
|
||||
multiple rows/columns.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
table
|
||||
textline
|
||||
direction
|
||||
flag_size
|
||||
table : camelot.core.Table
|
||||
textline : object
|
||||
PDFMiner LTTextLine object.
|
||||
direction : string
|
||||
Direction of the PDFMiner LTTextLine object.
|
||||
flag_size : bool, optional (default: False)
|
||||
Whether or not to highlight a substring using <s></s>
|
||||
if its size is different from rest of the string, useful for
|
||||
super and subscripts.
|
||||
|
||||
Returns
|
||||
-------
|
||||
grouped_chars : list
|
||||
List of tuples of the form (idx, text) where idx is the index
|
||||
of row/column and text is an LTTextLine substring.
|
||||
|
||||
"""
|
||||
idx = 0
|
||||
|
|
@ -388,19 +454,38 @@ def split_textline(table, textline, direction, flag_size=True):
|
|||
return grouped_chars
|
||||
|
||||
|
||||
def get_table_index(table, t, direction, split_text=False, flag_size=True):
|
||||
"""
|
||||
def get_table_index(table, t, direction, split_text=False, flag_size=False):
|
||||
"""Gets indices of the table cell where given text object lies by
|
||||
comparing their y and x-coordinates.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
table
|
||||
t
|
||||
direction
|
||||
split_text
|
||||
flag_size
|
||||
table : camelot.core.Table
|
||||
t : object
|
||||
PDFMiner LTTextLine object.
|
||||
direction : string
|
||||
Direction of the PDFMiner LTTextLine object.
|
||||
split_text : bool, optional (default: False)
|
||||
Whether or not to split a text line if it spans across
|
||||
multiple cells.
|
||||
flag_size : bool, optional (default: False)
|
||||
Whether or not to highlight a substring using <s></s>
|
||||
if its size is different from rest of the string, useful for
|
||||
super and subscripts.
|
||||
|
||||
Returns
|
||||
-------
|
||||
indices : list
|
||||
List of tuples of the form (r_idx, c_idx, text) where r_idx
|
||||
and c_idx are row and column indices.
|
||||
error : float
|
||||
Assignment error, percentage of text area that lies outside
|
||||
a cell.
|
||||
+-------+
|
||||
| |
|
||||
| [Text bounding box]
|
||||
| |
|
||||
+-------+
|
||||
|
||||
"""
|
||||
r_idx, c_idx = [-1] * 2
|
||||
|
|
@ -450,14 +535,19 @@ def get_table_index(table, t, direction, split_text=False, flag_size=True):
|
|||
|
||||
|
||||
def compute_accuracy(error_weights):
|
||||
"""
|
||||
"""Calculates a score based on weights assigned to various
|
||||
parameters and their error percentages.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
error_weights
|
||||
error_weights : list
|
||||
Two-dimensional list of the form [[p1, e1], [p2, e2], ...]
|
||||
where pn is the weight assigned to list of errors en.
|
||||
Sum of pn should be equal to 100.
|
||||
|
||||
Returns
|
||||
-------
|
||||
score : float
|
||||
|
||||
"""
|
||||
SCORE_VAL = 100
|
||||
|
|
@ -474,50 +564,40 @@ def compute_accuracy(error_weights):
|
|||
return score
|
||||
|
||||
|
||||
def count_empty_strings(d):
|
||||
"""
|
||||
def compute_whitespace(d):
|
||||
"""Calculates the percentage of empty strings in a
|
||||
two-dimensional list.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
d
|
||||
d : list
|
||||
|
||||
Returns
|
||||
-------
|
||||
whitespace : float
|
||||
Percentage of empty cells.
|
||||
|
||||
"""
|
||||
empty_p = 0
|
||||
whitespace = 0
|
||||
r_nempty_cells, c_nempty_cells = [], []
|
||||
for i in d:
|
||||
for j in i:
|
||||
if j.strip() == '':
|
||||
empty_p += 1
|
||||
empty_p = 100 * (empty_p / float(len(d) * len(d[0])))
|
||||
for row in d:
|
||||
r_nempty_c = 0
|
||||
for r in row:
|
||||
if r.strip() != '':
|
||||
r_nempty_c += 1
|
||||
r_nempty_cells.append(r_nempty_c)
|
||||
d = zip(*d)
|
||||
d = [list(col) for col in d]
|
||||
for col in d:
|
||||
c_nempty_c = 0
|
||||
for c in col:
|
||||
if c.strip() != '':
|
||||
c_nempty_c += 1
|
||||
c_nempty_cells.append(c_nempty_c)
|
||||
return empty_p, r_nempty_cells, c_nempty_cells
|
||||
whitespace += 1
|
||||
whitespace = 100 * (whitespace / float(len(d) * len(d[0])))
|
||||
return whitespace
|
||||
|
||||
|
||||
def remove_empty_strings(d):
|
||||
"""
|
||||
def remove_empty(d):
|
||||
"""Removes empty rows and columns from a two-dimensional list.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
d
|
||||
d : list
|
||||
|
||||
Returns
|
||||
-------
|
||||
d : list
|
||||
|
||||
"""
|
||||
for i, row in enumerate(d):
|
||||
|
|
@ -530,70 +610,46 @@ def remove_empty_strings(d):
|
|||
|
||||
|
||||
def encode_(ar):
|
||||
"""
|
||||
"""Encodes each string in a two-dimensional list as UTF-8.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
ar
|
||||
ar : list
|
||||
|
||||
Returns
|
||||
-------
|
||||
ar : list
|
||||
|
||||
"""
|
||||
ar = [[r.encode('utf-8') for r in row] for row in ar]
|
||||
return ar
|
||||
|
||||
|
||||
def get_text_objects(layout, ltype="char", t=None):
|
||||
"""
|
||||
|
||||
Parameters
|
||||
----------
|
||||
layout
|
||||
ltype
|
||||
t
|
||||
|
||||
Returns
|
||||
-------
|
||||
|
||||
"""
|
||||
if ltype == "char":
|
||||
LTObject = LTChar
|
||||
elif ltype == "lh":
|
||||
LTObject = LTTextLineHorizontal
|
||||
elif ltype == "lv":
|
||||
LTObject = LTTextLineVertical
|
||||
if t is None:
|
||||
t = []
|
||||
try:
|
||||
for obj in layout._objs:
|
||||
if isinstance(obj, LTObject):
|
||||
t.append(obj)
|
||||
else:
|
||||
t += get_text_objects(obj, ltype=ltype)
|
||||
except AttributeError:
|
||||
pass
|
||||
return t
|
||||
|
||||
|
||||
def get_page_layout(pname, char_margin=1.0, line_margin=0.5, word_margin=0.1,
|
||||
def get_page_layout(filename, char_margin=1.0, line_margin=0.5, word_margin=0.1,
|
||||
detect_vertical=True, all_texts=True):
|
||||
"""
|
||||
"""Returns a PDFMiner LTPage object and page dimension of a single
|
||||
page pdf. See https://euske.github.io/pdfminer/ to get definitions
|
||||
of kwargs.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
pname
|
||||
char_margin
|
||||
line_margin
|
||||
word_margin
|
||||
detect_vertical
|
||||
all_texts
|
||||
filename : string
|
||||
Path to pdf file.
|
||||
char_margin : float
|
||||
line_margin : float
|
||||
word_margin : float
|
||||
detect_vertical : bool
|
||||
all_texts : bool
|
||||
|
||||
Returns
|
||||
-------
|
||||
layout : object
|
||||
PDFMiner LTPage object.
|
||||
dim : tuple
|
||||
Dimension of pdf page in the form (width, height).
|
||||
|
||||
"""
|
||||
with open(pname, 'r') as f:
|
||||
with open(filename, 'r') as f:
|
||||
parser = PDFParser(f)
|
||||
document = PDFDocument(parser)
|
||||
if not document.is_extractable:
|
||||
|
|
@ -615,12 +671,56 @@ def get_page_layout(pname, char_margin=1.0, line_margin=0.5, word_margin=0.1,
|
|||
return layout, dim
|
||||
|
||||
|
||||
def merge_tuples(tuples):
|
||||
"""
|
||||
def get_text_objects(layout, ltype="char", t=None):
|
||||
"""Recursively parses pdf layout to get a list of
|
||||
PDFMiner text objects.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
tuples
|
||||
layout : object
|
||||
PDFMiner LTPage object.
|
||||
ltype : string
|
||||
Specify 'char', 'lh', 'lv' to get LTChar, LTTextLineHorizontal,
|
||||
and LTTextLineVertical objects respectively.
|
||||
t : list
|
||||
|
||||
Returns
|
||||
-------
|
||||
t : list
|
||||
List of PDFMiner text objects.
|
||||
|
||||
"""
|
||||
if ltype == "char":
|
||||
LTObject = LTChar
|
||||
elif ltype == "lh":
|
||||
LTObject = LTTextLineHorizontal
|
||||
elif ltype == "lv":
|
||||
LTObject = LTTextLineVertical
|
||||
if t is None:
|
||||
t = []
|
||||
try:
|
||||
for obj in layout._objs:
|
||||
if isinstance(obj, LTObject):
|
||||
t.append(obj)
|
||||
else:
|
||||
t += get_text_objects(obj, ltype=ltype)
|
||||
except AttributeError:
|
||||
pass
|
||||
return t
|
||||
|
||||
|
||||
def merge_tuples(tuples):
|
||||
"""Merges a list of overlapping tuples.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
tuples : list
|
||||
List of tuples where a tuple is a single axis coordinate pair.
|
||||
|
||||
Yields
|
||||
------
|
||||
tuple
|
||||
|
||||
"""
|
||||
merged = list(tuples[0])
|
||||
for s, e in tuples:
|
||||
|
|
|
|||
38
docs/api.rst
38
docs/api.rst
|
|
@ -4,17 +4,37 @@
|
|||
API Reference
|
||||
=============
|
||||
|
||||
Pdf
|
||||
===
|
||||
.. automodule:: camelot.pdf
|
||||
camelot.read_pdf
|
||||
================
|
||||
.. automodule:: camelot.read_pdf
|
||||
:members:
|
||||
|
||||
Lattice
|
||||
=======
|
||||
.. automodule:: camelot.lattice
|
||||
camelot.handlers.PDFHandler
|
||||
===========================
|
||||
.. autoclass:: camelot.handlers.PDFHandler
|
||||
:members:
|
||||
|
||||
Stream
|
||||
======
|
||||
.. automodule:: camelot.stream
|
||||
camelot.parsers.Stream
|
||||
======================
|
||||
.. autoclass:: camelot.parsers.Stream
|
||||
:members:
|
||||
|
||||
camelot.parsers.Lattice
|
||||
=======================
|
||||
.. autoclass:: camelot.parsers.Lattice
|
||||
:members:
|
||||
|
||||
camelot.core.Cell
|
||||
=================
|
||||
.. autoclass:: camelot.core.Cell
|
||||
:members:
|
||||
|
||||
camelot.core.Table
|
||||
==================
|
||||
.. autoclass:: camelot.core.Table
|
||||
:members:
|
||||
|
||||
camelot.core.TableList
|
||||
======================
|
||||
.. autoclass:: camelot.core.TableList
|
||||
:members:
|
||||
100
docs/index.rst
100
docs/index.rst
|
|
@ -3,11 +3,11 @@
|
|||
You can adapt this file completely to your liking, but it should at least
|
||||
contain the root `toctree` directive.
|
||||
|
||||
==================================
|
||||
Camelot: pdf parsing made simpler!
|
||||
==================================
|
||||
=====================================
|
||||
Camelot: PDF Table Parsing for Humans
|
||||
=====================================
|
||||
|
||||
Camelot is a Python 2.7 library and command-line tool for getting tables out of pdf files.
|
||||
Camelot is a Python 2.7 library and command-line tool for extracting tabular data from PDF files.
|
||||
|
||||
Why another PDF table parsing library?
|
||||
======================================
|
||||
|
|
@ -32,12 +32,22 @@ Usage
|
|||
|
||||
::
|
||||
|
||||
>>> from camelot.pdf import Pdf
|
||||
>>> from camelot.lattice import Lattice
|
||||
|
||||
>>> manager = Pdf(Lattice(), 'us-030.pdf')
|
||||
>>> tables = manager.extract()
|
||||
>>> print tables['page-1']['table-1']['data']
|
||||
>>> import camelot
|
||||
>>> tables = camelot.read_pdf("foo.pdf")
|
||||
>>> tables
|
||||
<TableList n=2>
|
||||
>>> tables.export("foo.csv", f="csv", compress=True) # json, excel, html
|
||||
>>> tables[0]
|
||||
<Table shape=(3,4)>
|
||||
>>> tables[0].to_csv("foo.csv") # to_json, to_excel, to_html
|
||||
>>> tables[0].parsing_report
|
||||
{
|
||||
"accuracy": 96,
|
||||
"whitespace": 80,
|
||||
"order": 1,
|
||||
"page": 1
|
||||
}
|
||||
>>> df = tables[0].df
|
||||
|
||||
.. csv-table::
|
||||
:header: "Cycle Name","KI (1/km)","Distance (mi)","Percent Fuel Savings","","",""
|
||||
|
|
@ -49,45 +59,6 @@ Usage
|
|||
"2032_2","0.17","57.8","21.7%","0.3%","2.7%","1.2%"
|
||||
"4171_1","0.07","173.9","58.1%","1.6%","2.1%","0.5%"
|
||||
|
||||
Camelot comes with a CLI where you can specify page numbers, output format, output directory etc. By default, the output files are placed in the same directory as the PDF.
|
||||
|
||||
::
|
||||
|
||||
Camelot: PDF parsing made simpler!
|
||||
|
||||
usage:
|
||||
camelot [options] <method> [<args>...]
|
||||
|
||||
options:
|
||||
-h, --help Show this screen.
|
||||
-v, --version Show version.
|
||||
-V, --verbose Verbose.
|
||||
-p, --pages <pageno> Comma-separated list of page numbers.
|
||||
Example: -p 1,3-6,10 [default: 1]
|
||||
-P, --parallel Parallelize the parsing process.
|
||||
-f, --format <format> Output format. (csv,tsv,html,json,xlsx) [default: csv]
|
||||
-l, --log Log to file.
|
||||
-o, --output <directory> Output directory.
|
||||
-M, --cmargin <cmargin> Char margin. Chars closer than cmargin are
|
||||
grouped together to form a word. [default: 1.0]
|
||||
-L, --lmargin <lmargin> Line margin. Lines closer than lmargin are
|
||||
grouped together to form a textbox. [default: 0.5]
|
||||
-W, --wmargin <wmargin> Word margin. Insert blank spaces between chars
|
||||
if distance between words is greater than word
|
||||
margin. [default: 0.1]
|
||||
-J, --split_text Split text lines if they span across multiple cells.
|
||||
-K, --flag_size Flag substring if its size differs from the whole string.
|
||||
Useful for super and subscripts.
|
||||
-X, --print-stats List stats on the parsing process.
|
||||
-Y, --save-stats Save stats to a file.
|
||||
-Z, --plot <dist> Plot distributions. (page,all,rc)
|
||||
|
||||
camelot methods:
|
||||
lattice Looks for lines between data.
|
||||
stream Looks for spaces between data.
|
||||
|
||||
See 'camelot <method> -h' for more information on a specific method.
|
||||
|
||||
Installation
|
||||
============
|
||||
|
||||
|
|
@ -95,42 +66,41 @@ Make sure you have the most updated versions for `pip` and `setuptools`. You can
|
|||
|
||||
pip install -U pip setuptools
|
||||
|
||||
The required dependencies include `numpy`_, `OpenCV`_ and `ImageMagick`_.
|
||||
The dependencies include `tk`_ and `ghostscript`_.
|
||||
|
||||
.. _numpy: http://www.numpy.org/
|
||||
.. _OpenCV: http://opencv.org/
|
||||
.. _ImageMagick: http://www.imagemagick.org/script/index.php
|
||||
.. _tk: https://wiki.tcl.tk/3743
|
||||
.. _ghostscript: https://www.ghostscript.com/
|
||||
|
||||
Installing dependencies
|
||||
-----------------------
|
||||
|
||||
numpy can be installed using `pip`. OpenCV and ImageMagick can be installed using your system's default package manager.
|
||||
tk and ghostscript can be installed using your system's default package manager.
|
||||
|
||||
Linux
|
||||
^^^^^
|
||||
|
||||
* Arch Linux
|
||||
|
||||
::
|
||||
|
||||
sudo pacman -S opencv imagemagick
|
||||
|
||||
* Ubuntu
|
||||
|
||||
::
|
||||
|
||||
sudo apt-get install libopencv-dev python-opencv imagemagick
|
||||
sudo apt-get install python-opencv python-tk ghostscript
|
||||
|
||||
* Arch Linux
|
||||
|
||||
::
|
||||
|
||||
sudo pacman -S opencv tk ghostscript
|
||||
|
||||
OS X
|
||||
^^^^
|
||||
|
||||
::
|
||||
|
||||
brew install homebrew/science/opencv imagemagick
|
||||
brew install homebrew/science/opencv ghostscript
|
||||
|
||||
Finally, `cd` into the project directory and install by::
|
||||
|
||||
make install
|
||||
python setup.py install
|
||||
|
||||
API Reference
|
||||
=============
|
||||
|
|
@ -150,14 +120,14 @@ You can check the latest sources with the command::
|
|||
Contributing
|
||||
------------
|
||||
|
||||
See :doc:`Contributing doc <contributing>`.
|
||||
See :doc:`Contributing guidelines <contributing>`.
|
||||
|
||||
Testing
|
||||
-------
|
||||
|
||||
::
|
||||
|
||||
make test
|
||||
python setup.py test
|
||||
|
||||
License
|
||||
=======
|
||||
|
|
|
|||
|
|
@ -0,0 +1,11 @@
|
|||
click==6.7
|
||||
matplotlib==2.2.3
|
||||
numpy==1.13.3
|
||||
opencv-python==3.4.2.17
|
||||
pandas==0.23.4
|
||||
pdfminer==20140328
|
||||
Pillow==5.2.0
|
||||
PyPDF2==1.26.0
|
||||
pytest==3.8.0
|
||||
pytest-runner==4.2
|
||||
Sphinx==1.8.0b1
|
||||
|
|
@ -1,8 +1,8 @@
|
|||
docopt==0.6.2
|
||||
click==6.7
|
||||
matplotlib==2.2.3
|
||||
nose==1.3.7
|
||||
numpy==1.13.3
|
||||
opencv-python==3.4.2.17
|
||||
pandas==0.23.4
|
||||
pdfminer==20140328
|
||||
pyexcel-xlsx==0.5.6
|
||||
Pillow==5.2.0
|
||||
PyPDF2==1.26.0
|
||||
Sphinx==1.8.0b1
|
||||
16
setup.py
16
setup.py
|
|
@ -4,12 +4,12 @@ import camelot
|
|||
|
||||
NAME = 'camelot'
|
||||
VERSION = camelot.__version__
|
||||
DESCRIPTION = 'camelot parses tables from PDFs!'
|
||||
DESCRIPTION = 'PDF Table Parsing for Humans'
|
||||
with open('README.md') as f:
|
||||
LONG_DESCRIPTION = f.read()
|
||||
URL = 'https://github.com/socialcopsdev/camelot'
|
||||
AUTHOR = 'Vinayak Mehta'
|
||||
AUTHOR_EMAIL = 'vinayak@socialcops.com'
|
||||
AUTHOR_EMAIL = 'vmehta94@gmail.com'
|
||||
LICENSE = 'BSD License'
|
||||
|
||||
opencv_min_version = '2.4.8'
|
||||
|
|
@ -58,18 +58,14 @@ def setup_package():
|
|||
|
||||
opencv_status = get_opencv_status()
|
||||
opencv_req_str = "camelot requires OpenCV >= {0}.\n".format(opencv_min_version)
|
||||
instructions = ("Installation instructions are available in the README at "
|
||||
"https://github.com/socialcopsdev/camelot")
|
||||
|
||||
if opencv_status['up_to_date'] is False:
|
||||
if opencv_status['version']:
|
||||
raise ImportError("Your installation of OpenCV "
|
||||
"{0} is out-of-date.\n{1}{2}"
|
||||
.format(opencv_status['version'],
|
||||
opencv_req_str, instructions))
|
||||
raise ImportError("Your installation of OpenCV {} is out-of-date.\n{}"
|
||||
.format(opencv_status['version'], opencv_req_str))
|
||||
else:
|
||||
raise ImportError("OpenCV is not installed.\n{0}{1}"
|
||||
.format(opencv_req_str, instructions))
|
||||
raise ImportError("OpenCV is not installed.\n{}"
|
||||
.format(opencv_req_str))
|
||||
|
||||
setup(**metadata)
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue