Add docstrings and update docs
parent
16c6b8d45d
commit
9878de4dfc
|
|
@ -8,3 +8,5 @@ dist/
|
||||||
.coverage
|
.coverage
|
||||||
|
|
||||||
.pytest_cache/
|
.pytest_cache/
|
||||||
|
_build/
|
||||||
|
_static/
|
||||||
|
|
|
||||||
65
README.md
65
README.md
|
|
@ -23,50 +23,9 @@ Camelot is a Python 2.7 library and command-line tool for extracting tabular dat
|
||||||
>>> df = tables[0].df
|
>>> df = tables[0].df
|
||||||
</pre>
|
</pre>
|
||||||
|
|
||||||
Camelot comes with a CLI where you can specify page numbers, output format, output directory etc. By default, the output files are placed in the same directory as the PDF.
|
|
||||||
|
|
||||||
<pre>
|
|
||||||
Camelot: PDF parsing made simpler!
|
|
||||||
|
|
||||||
usage:
|
|
||||||
camelot [options] <method> [<args>...]
|
|
||||||
|
|
||||||
options:
|
|
||||||
-h, --help Show this screen.
|
|
||||||
-v, --version Show version.
|
|
||||||
-V, --verbose Verbose.
|
|
||||||
-p, --pages <pageno> Comma-separated list of page numbers.
|
|
||||||
Example: -p 1,3-6,10 [default: 1]
|
|
||||||
-P, --parallel Parallelize the parsing process.
|
|
||||||
-f, --format <format> Output format. (csv,tsv,html,json,xlsx) [default: csv]
|
|
||||||
-l, --log Log to file.
|
|
||||||
-o, --output <directory> Output directory.
|
|
||||||
-M, --cmargin <cmargin> Char margin. Chars closer than cmargin are
|
|
||||||
grouped together to form a word. [default: 2.0]
|
|
||||||
-L, --lmargin <lmargin> Line margin. Lines closer than lmargin are
|
|
||||||
grouped together to form a textbox. [default: 0.5]
|
|
||||||
-W, --wmargin <wmargin> Word margin. Insert blank spaces between chars
|
|
||||||
if distance between words is greater than word
|
|
||||||
margin. [default: 0.1]
|
|
||||||
-J, --split_text Split text lines if they span across multiple cells.
|
|
||||||
-K, --flag_size Flag substring if its size differs from the whole string.
|
|
||||||
Useful for super and subscripts.
|
|
||||||
-X, --print-stats List stats on the parsing process.
|
|
||||||
-Y, --save-stats Save stats to a file.
|
|
||||||
-Z, --plot <dist> Plot distributions. (page,all,rc)
|
|
||||||
|
|
||||||
camelot methods:
|
|
||||||
lattice Looks for lines between data.
|
|
||||||
stream Looks for spaces between data.
|
|
||||||
|
|
||||||
See 'camelot <method> -h' for more information on a specific method.
|
|
||||||
</pre>
|
|
||||||
|
|
||||||
## Dependencies
|
## Dependencies
|
||||||
|
|
||||||
Currently, camelot works under Python 2.7.
|
The dependencies include [tk](https://wiki.tcl.tk/3743) and [ghostscript](https://www.ghostscript.com/).
|
||||||
|
|
||||||
The required dependencies include [numpy](http://www.numpy.org/), [OpenCV](http://opencv.org/) and [ghostscript](https://www.ghostscript.com/).
|
|
||||||
|
|
||||||
## Installation
|
## Installation
|
||||||
|
|
||||||
|
|
@ -78,22 +37,22 @@ pip install -U pip setuptools
|
||||||
|
|
||||||
### Installing dependencies
|
### Installing dependencies
|
||||||
|
|
||||||
numpy can be install using `pip`. OpenCV and ghostscript can be installed using your system's default package manager.
|
tk and ghostscript can be installed using your system's default package manager.
|
||||||
|
|
||||||
#### Linux
|
#### Linux
|
||||||
|
|
||||||
* Arch Linux
|
|
||||||
|
|
||||||
<pre>
|
|
||||||
sudo pacman -S opencv tk ghostscript
|
|
||||||
</pre>
|
|
||||||
|
|
||||||
* Ubuntu
|
* Ubuntu
|
||||||
|
|
||||||
<pre>
|
<pre>
|
||||||
sudo apt-get install python-opencv python-tk ghostscript
|
sudo apt-get install python-opencv python-tk ghostscript
|
||||||
</pre>
|
</pre>
|
||||||
|
|
||||||
|
* Arch Linux
|
||||||
|
|
||||||
|
<pre>
|
||||||
|
sudo pacman -S opencv tk ghostscript
|
||||||
|
</pre>
|
||||||
|
|
||||||
#### OS X
|
#### OS X
|
||||||
|
|
||||||
<pre>
|
<pre>
|
||||||
|
|
@ -103,7 +62,7 @@ brew install homebrew/science/opencv ghostscript
|
||||||
Finally, `cd` into the project directory and install by
|
Finally, `cd` into the project directory and install by
|
||||||
|
|
||||||
<pre>
|
<pre>
|
||||||
make install
|
python setup.py install
|
||||||
</pre>
|
</pre>
|
||||||
|
|
||||||
## Development
|
## Development
|
||||||
|
|
@ -118,14 +77,14 @@ git clone https://github.com/socialcopsdev/camelot.git
|
||||||
|
|
||||||
### Contributing
|
### Contributing
|
||||||
|
|
||||||
See [Contributing doc]().
|
See [Contributing guidelines]().
|
||||||
|
|
||||||
### Testing
|
### Testing
|
||||||
|
|
||||||
<pre>
|
<pre>
|
||||||
make test
|
python setup.py test
|
||||||
</pre>
|
</pre>
|
||||||
|
|
||||||
## License
|
## License
|
||||||
|
|
||||||
BSD License
|
BSD License
|
||||||
186
camelot/core.py
186
camelot/core.py
|
|
@ -8,9 +8,48 @@ import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
class Cell(object):
|
class Cell(object):
|
||||||
"""
|
"""Defines a cell in a table with coordinates relative to a
|
||||||
|
left-bottom origin. (pdf coordinate space)
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
x1 : float
|
||||||
|
x-coordinate of left-bottom point.
|
||||||
|
y1 : float
|
||||||
|
y-coordinate of left-bottom point.
|
||||||
|
x2 : float
|
||||||
|
x-coordinate of right-top point.
|
||||||
|
y2 : float
|
||||||
|
y-coordinate of right-top point.
|
||||||
|
|
||||||
|
Attributes
|
||||||
|
----------
|
||||||
|
lb : tuple
|
||||||
|
Tuple representing left-bottom coordinates.
|
||||||
|
lt : tuple
|
||||||
|
Tuple representing left-top coordinates.
|
||||||
|
rb : tuple
|
||||||
|
Tuple representing right-bottom coordinates.
|
||||||
|
rt : tuple
|
||||||
|
Tuple representing right-top coordinates.
|
||||||
|
left : bool
|
||||||
|
Whether or not cell is bounded on the left.
|
||||||
|
right : bool
|
||||||
|
Whether or not cell is bounded on the right.
|
||||||
|
top : bool
|
||||||
|
Whether or not cell is bounded on the top.
|
||||||
|
bottom : bool
|
||||||
|
Whether or not cell is bounded on the bottom.
|
||||||
|
hspan : bool
|
||||||
|
Whether or not cell spans horizontally.
|
||||||
|
vspan : bool
|
||||||
|
Whether or not cell spans vertically.
|
||||||
|
text : string
|
||||||
|
Text assigned to cell.
|
||||||
|
bound
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, x1, y1, x2, y2):
|
def __init__(self, x1, y1, x2, y2):
|
||||||
self.x1 = x1
|
self.x1 = x1
|
||||||
self.y1 = y1
|
self.y1 = y1
|
||||||
|
|
@ -34,37 +73,48 @@ class Cell(object):
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def text(self):
|
def text(self):
|
||||||
"""
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
|
|
||||||
"""
|
|
||||||
return self._text
|
return self._text
|
||||||
|
|
||||||
@text.setter
|
@text.setter
|
||||||
def text(self, t):
|
def text(self, t):
|
||||||
"""
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
t
|
|
||||||
"""
|
|
||||||
self._text = ''.join([self._text, t])
|
self._text = ''.join([self._text, t])
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def bound(self):
|
def bound(self):
|
||||||
"""
|
"""The number of sides on which the cell is bounded.
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
return self.top + self.bottom + self.left + self.right
|
return self.top + self.bottom + self.left + self.right
|
||||||
|
|
||||||
|
|
||||||
class Table(object):
|
class Table(object):
|
||||||
"""
|
"""Defines a table with coordinates relative to a left-bottom
|
||||||
|
origin. (pdf coordinate space)
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
cols : list
|
||||||
|
List of tuples representing column x-coordinates in increasing
|
||||||
|
order.
|
||||||
|
rows : list
|
||||||
|
List of tuples representing row y-coordinates in decreasing
|
||||||
|
order.
|
||||||
|
|
||||||
|
Attributes
|
||||||
|
----------
|
||||||
|
df : object
|
||||||
|
pandas.DataFrame
|
||||||
|
shape : tuple
|
||||||
|
Shape of the table.
|
||||||
|
accuracy : float
|
||||||
|
Accuracy with which text was assigned to the cell.
|
||||||
|
whitespace : float
|
||||||
|
Percentage of whitespace in the table.
|
||||||
|
order : int
|
||||||
|
Table number on pdf page.
|
||||||
|
page : int
|
||||||
|
Pdf page number.
|
||||||
|
data
|
||||||
|
parsing_report
|
||||||
|
|
||||||
"""
|
"""
|
||||||
def __init__(self, cols, rows):
|
def __init__(self, cols, rows):
|
||||||
|
|
@ -84,11 +134,7 @@ class Table(object):
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def data(self):
|
def data(self):
|
||||||
"""
|
"""Returns two-dimensional list of strings in table.
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
d = []
|
d = []
|
||||||
for row in self.cells:
|
for row in self.cells:
|
||||||
|
|
@ -97,11 +143,8 @@ class Table(object):
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def parsing_report(self):
|
def parsing_report(self):
|
||||||
"""
|
"""Returns a parsing report with accuracy, %whitespace,
|
||||||
|
table number on page and page number.
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
# pretty?
|
# pretty?
|
||||||
report = {
|
report = {
|
||||||
|
|
@ -112,27 +155,8 @@ class Table(object):
|
||||||
}
|
}
|
||||||
return report
|
return report
|
||||||
|
|
||||||
def set_border(self):
|
|
||||||
"""
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
|
|
||||||
"""
|
|
||||||
for r in range(len(self.rows)):
|
|
||||||
self.cells[r][0].left = True
|
|
||||||
self.cells[r][len(self.cols) - 1].right = True
|
|
||||||
for c in range(len(self.cols)):
|
|
||||||
self.cells[0][c].top = True
|
|
||||||
self.cells[len(self.rows) - 1][c].bottom = True
|
|
||||||
return self
|
|
||||||
|
|
||||||
def set_all_edges(self):
|
def set_all_edges(self):
|
||||||
"""
|
"""Sets all table edges to True.
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
for row in self.cells:
|
for row in self.cells:
|
||||||
for cell in row:
|
for cell in row:
|
||||||
|
|
@ -140,16 +164,16 @@ class Table(object):
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def set_edges(self, vertical, horizontal, joint_close_tol=2):
|
def set_edges(self, vertical, horizontal, joint_close_tol=2):
|
||||||
"""
|
"""Sets a cell's edges to True depending on whether the cell's
|
||||||
|
coordinates overlap with the line's coordinates within a
|
||||||
|
tolerance.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
vertical
|
vertical : list
|
||||||
horizontal
|
List of detected vertical lines.
|
||||||
joint_close_tol
|
horizontal : list
|
||||||
|
List of detected horizontal lines.
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
for v in vertical:
|
for v in vertical:
|
||||||
|
|
@ -256,12 +280,20 @@ class Table(object):
|
||||||
|
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def set_span(self):
|
def set_border(self):
|
||||||
|
"""Sets table border edges to True.
|
||||||
"""
|
"""
|
||||||
|
for r in range(len(self.rows)):
|
||||||
|
self.cells[r][0].left = True
|
||||||
|
self.cells[r][len(self.cols) - 1].right = True
|
||||||
|
for c in range(len(self.cols)):
|
||||||
|
self.cells[0][c].top = True
|
||||||
|
self.cells[len(self.rows) - 1][c].bottom = True
|
||||||
|
return self
|
||||||
|
|
||||||
Returns
|
def set_span(self):
|
||||||
-------
|
"""Sets a cell's hspan or vspan attribute to True depending
|
||||||
|
on whether the cell spans horizontally or vertically.
|
||||||
"""
|
"""
|
||||||
for row in self.cells:
|
for row in self.cells:
|
||||||
for cell in row:
|
for cell in row:
|
||||||
|
|
@ -288,6 +320,8 @@ class Table(object):
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def to_csv(self, path, **kwargs):
|
def to_csv(self, path, **kwargs):
|
||||||
|
"""Write Table to a comma-separated values (csv) file.
|
||||||
|
"""
|
||||||
kw = {
|
kw = {
|
||||||
'encoding': 'utf-8',
|
'encoding': 'utf-8',
|
||||||
'index': False,
|
'index': False,
|
||||||
|
|
@ -297,6 +331,8 @@ class Table(object):
|
||||||
self.df.to_csv(path, **kw)
|
self.df.to_csv(path, **kw)
|
||||||
|
|
||||||
def to_json(self, path, **kwargs):
|
def to_json(self, path, **kwargs):
|
||||||
|
"""Write Table to a JSON file.
|
||||||
|
"""
|
||||||
kw = {
|
kw = {
|
||||||
'orient': 'records'
|
'orient': 'records'
|
||||||
}
|
}
|
||||||
|
|
@ -306,6 +342,8 @@ class Table(object):
|
||||||
f.write(json_string)
|
f.write(json_string)
|
||||||
|
|
||||||
def to_excel(self, path, **kwargs):
|
def to_excel(self, path, **kwargs):
|
||||||
|
"""Write Table to an Excel file.
|
||||||
|
"""
|
||||||
kw = {
|
kw = {
|
||||||
'sheet_name': 'page-{}-table-{}'.format(self.page, self.order),
|
'sheet_name': 'page-{}-table-{}'.format(self.page, self.order),
|
||||||
'encoding': 'utf-8'
|
'encoding': 'utf-8'
|
||||||
|
|
@ -316,13 +354,21 @@ class Table(object):
|
||||||
writer.save()
|
writer.save()
|
||||||
|
|
||||||
def to_html(self, path, **kwargs):
|
def to_html(self, path, **kwargs):
|
||||||
|
"""Write Table to an HTML file.
|
||||||
|
"""
|
||||||
html_string = self.df.to_html(**kwargs)
|
html_string = self.df.to_html(**kwargs)
|
||||||
with open(path, 'w') as f:
|
with open(path, 'w') as f:
|
||||||
f.write(html_string)
|
f.write(html_string)
|
||||||
|
|
||||||
|
|
||||||
class TableList(object):
|
class TableList(object):
|
||||||
"""
|
"""Defines a list of camelot.core.Table objects. Each table can
|
||||||
|
be accessed using its index.
|
||||||
|
|
||||||
|
Attributes
|
||||||
|
----------
|
||||||
|
n : int
|
||||||
|
Number of tables in the list.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
def __init__(self, tables):
|
def __init__(self, tables):
|
||||||
|
|
@ -371,6 +417,18 @@ class TableList(object):
|
||||||
z.write(filepath, os.path.basename(filepath))
|
z.write(filepath, os.path.basename(filepath))
|
||||||
|
|
||||||
def export(self, path, f='csv', compress=False):
|
def export(self, path, f='csv', compress=False):
|
||||||
|
"""Exports the list of tables to specified file format.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
path : str
|
||||||
|
Filepath
|
||||||
|
f : str
|
||||||
|
File format. Can be csv, json, excel or html.
|
||||||
|
compress : bool
|
||||||
|
Whether or not to add files to a ZIP archive.
|
||||||
|
|
||||||
|
"""
|
||||||
dirname = os.path.dirname(path)
|
dirname = os.path.dirname(path)
|
||||||
basename = os.path.basename(path)
|
basename = os.path.basename(path)
|
||||||
root, ext = os.path.splitext(basename)
|
root, ext = os.path.splitext(basename)
|
||||||
|
|
@ -402,9 +460,6 @@ class TableList(object):
|
||||||
|
|
||||||
|
|
||||||
class Geometry(object):
|
class Geometry(object):
|
||||||
"""
|
|
||||||
|
|
||||||
"""
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.text = []
|
self.text = []
|
||||||
self.images = ()
|
self.images = ()
|
||||||
|
|
@ -421,9 +476,6 @@ class Geometry(object):
|
||||||
|
|
||||||
|
|
||||||
class GeometryList(object):
|
class GeometryList(object):
|
||||||
"""
|
|
||||||
|
|
||||||
"""
|
|
||||||
def __init__(self, geometry):
|
def __init__(self, geometry):
|
||||||
self.text = [g.text for g in geometry]
|
self.text = [g.text for g in geometry]
|
||||||
self.images = [g.images for g in geometry]
|
self.images = [g.images for g in geometry]
|
||||||
|
|
|
||||||
|
|
@ -9,18 +9,43 @@ from .utils import get_page_layout, get_text_objects, get_rotation
|
||||||
|
|
||||||
|
|
||||||
class PDFHandler(object):
|
class PDFHandler(object):
|
||||||
"""
|
"""Handles all operations like temp directory creation, splitting
|
||||||
|
file into single page pdfs, parsing each pdf and then removing the
|
||||||
|
temp directory.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
filename : str
|
||||||
|
Path to pdf file.
|
||||||
|
pages : str
|
||||||
|
Comma-separated page numbers to parse.
|
||||||
|
Example: 1,3,4 or 1,4-end
|
||||||
|
|
||||||
"""
|
"""
|
||||||
def __init__(self, filename, pages='1'):
|
def __init__(self, filename, pages='1'):
|
||||||
self.filename = filename
|
self.filename = filename
|
||||||
if not self.filename.endswith('.pdf'):
|
if not self.filename.endswith('.pdf'):
|
||||||
raise TypeError("File format not supported.")
|
raise TypeError("File format not supported.")
|
||||||
self.pages = self.__get_pages(self.filename, pages)
|
self.pages = self._get_pages(self.filename, pages)
|
||||||
self.tempdir = tempfile.mkdtemp()
|
self.tempdir = tempfile.mkdtemp()
|
||||||
|
|
||||||
def __get_pages(self, filename, pages):
|
def _get_pages(self, filename, pages):
|
||||||
# refactor
|
"""Converts pages string to list of ints.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
filename : str
|
||||||
|
Path to pdf file.
|
||||||
|
pages : str
|
||||||
|
Comma-separated page numbers to parse.
|
||||||
|
Example: 1,3,4 or 1,4-end
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
P : list
|
||||||
|
List of int page numbers.
|
||||||
|
|
||||||
|
"""
|
||||||
page_numbers = []
|
page_numbers = []
|
||||||
if pages == '1':
|
if pages == '1':
|
||||||
page_numbers.append({'start': 1, 'end': 1})
|
page_numbers.append({'start': 1, 'end': 1})
|
||||||
|
|
@ -42,8 +67,19 @@ class PDFHandler(object):
|
||||||
P.extend(range(p['start'], p['end'] + 1))
|
P.extend(range(p['start'], p['end'] + 1))
|
||||||
return sorted(set(P))
|
return sorted(set(P))
|
||||||
|
|
||||||
def __save_page(self, filename, page, temp):
|
def _save_page(self, filename, page, temp):
|
||||||
# refactor
|
"""Saves specified page from pdf into a temporary directory.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
filename : str
|
||||||
|
Path to pdf file.
|
||||||
|
page : int
|
||||||
|
Page number
|
||||||
|
temp : str
|
||||||
|
Tmp directory
|
||||||
|
|
||||||
|
"""
|
||||||
with open(filename, 'rb') as fileobj:
|
with open(filename, 'rb') as fileobj:
|
||||||
infile = PdfFileReader(fileobj, strict=False)
|
infile = PdfFileReader(fileobj, strict=False)
|
||||||
fpath = os.path.join(temp, 'page-{0}.pdf'.format(page))
|
fpath = os.path.join(temp, 'page-{0}.pdf'.format(page))
|
||||||
|
|
@ -65,28 +101,37 @@ class PDFHandler(object):
|
||||||
infile = PdfFileReader(open(fpath_new, 'rb'), strict=False)
|
infile = PdfFileReader(open(fpath_new, 'rb'), strict=False)
|
||||||
outfile = PdfFileWriter()
|
outfile = PdfFileWriter()
|
||||||
p = infile.getPage(0)
|
p = infile.getPage(0)
|
||||||
if rotation == 'left':
|
if rotation == 'anticlockwise':
|
||||||
p.rotateClockwise(90)
|
p.rotateClockwise(90)
|
||||||
elif rotation == 'right':
|
elif rotation == 'clockwise':
|
||||||
p.rotateCounterClockwise(90)
|
p.rotateCounterClockwise(90)
|
||||||
outfile.addPage(p)
|
outfile.addPage(p)
|
||||||
with open(fpath, 'wb') as f:
|
with open(fpath, 'wb') as f:
|
||||||
outfile.write(f)
|
outfile.write(f)
|
||||||
|
|
||||||
def parse(self, mesh=False, **kwargs):
|
def parse(self, mesh=False, **kwargs):
|
||||||
"""
|
"""Extracts tables by calling parser.get_tables on all single
|
||||||
|
page pdfs.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
mesh
|
mesh : bool (default: False)
|
||||||
kwargs
|
Whether or not to use Lattice method of parsing. Stream
|
||||||
|
is used by default.
|
||||||
|
kwargs : dict
|
||||||
|
See camelot.read_pdf kwargs.
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
|
tables : camelot.core.TableList
|
||||||
|
List of tables found in pdf.
|
||||||
|
geometry : camelot.core.GeometryList
|
||||||
|
List of geometry objects (contours, lines, joints)
|
||||||
|
found in pdf.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
for p in self.pages:
|
for p in self.pages:
|
||||||
self.__save_page(self.filename, p, self.tempdir)
|
self._save_page(self.filename, p, self.tempdir)
|
||||||
pages = [os.path.join(self.tempdir, 'page-{0}.pdf'.format(p))
|
pages = [os.path.join(self.tempdir, 'page-{0}.pdf'.format(p))
|
||||||
for p in self.pages]
|
for p in self.pages]
|
||||||
tables = []
|
tables = []
|
||||||
|
|
|
||||||
|
|
@ -9,17 +9,31 @@ from .utils import merge_tuples
|
||||||
|
|
||||||
|
|
||||||
def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
|
def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
|
||||||
"""
|
"""Thresholds an image using OpenCV's adaptiveThreshold.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
imagename
|
imagename : string
|
||||||
process_background
|
Path to image file.
|
||||||
blocksize
|
process_background : bool, optional (default: False)
|
||||||
c
|
Whether or not to process lines that are in background.
|
||||||
|
blocksize : int, optional (default: 15)
|
||||||
|
Size of a pixel neighborhood that is used to calculate a
|
||||||
|
threshold value for the pixel: 3, 5, 7, and so on.
|
||||||
|
|
||||||
|
For more information, refer to `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
|
||||||
|
c : int, optional (default: -2)
|
||||||
|
Constant subtracted from the mean or weighted mean.
|
||||||
|
Normally, it is positive but may be zero or negative as well.
|
||||||
|
|
||||||
|
For more information, refer to `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
|
img : object
|
||||||
|
numpy.ndarray representing the original image.
|
||||||
|
threshold : object
|
||||||
|
numpy.ndarray representing the thresholded image.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
img = cv2.imread(imagename)
|
img = cv2.imread(imagename)
|
||||||
|
|
@ -35,17 +49,35 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
|
||||||
|
|
||||||
|
|
||||||
def find_lines(threshold, direction='horizontal', line_size_scaling=15, iterations=0):
|
def find_lines(threshold, direction='horizontal', line_size_scaling=15, iterations=0):
|
||||||
"""
|
"""Finds horizontal and vertical lines by applying morphological
|
||||||
|
transformations on an image.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
threshold
|
threshold : object
|
||||||
direction
|
numpy.ndarray representing the thresholded image.
|
||||||
line_size_scaling
|
direction : string, optional (default: 'horizontal')
|
||||||
iterations
|
Specifies whether to find vertical or horizontal lines.
|
||||||
|
line_size_scaling : int, optional (default: 15)
|
||||||
|
Factor by which the page dimensions will be divided to get
|
||||||
|
smallest length of lines that should be detected.
|
||||||
|
|
||||||
|
The larger this value, the smaller the detected lines. Making it
|
||||||
|
too large will lead to text being detected as lines.
|
||||||
|
iterations : int, optional (default: 0)
|
||||||
|
Number of times erosion/dilation is applied.
|
||||||
|
|
||||||
|
For more information, refer to `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
|
dmask : object
|
||||||
|
numpy.ndarray representing pixels where vertical/horizontal
|
||||||
|
lines lie.
|
||||||
|
lines : list
|
||||||
|
List of tuples representing vertical/horizontal lines with
|
||||||
|
coordinates relative to a left-top origin in
|
||||||
|
image coordinate space.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
lines = []
|
lines = []
|
||||||
|
|
@ -84,15 +116,21 @@ def find_lines(threshold, direction='horizontal', line_size_scaling=15, iteratio
|
||||||
|
|
||||||
|
|
||||||
def find_table_contours(vertical, horizontal):
|
def find_table_contours(vertical, horizontal):
|
||||||
"""
|
"""Finds table boundaries using OpenCV's findContours.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
vertical
|
vertical : object
|
||||||
horizontal
|
numpy.ndarray representing pixels where vertical lines lie.
|
||||||
|
horizontal : object
|
||||||
|
numpy.ndarray representing pixels where horizontal lines lie.
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
|
cont : list
|
||||||
|
List of tuples representing table boundaries. Each tuple is of
|
||||||
|
the form (x, y, w, h) where (x, y) -> left-top, w -> width and
|
||||||
|
h -> height in image coordinate space.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
mask = vertical + horizontal
|
mask = vertical + horizontal
|
||||||
|
|
@ -114,16 +152,26 @@ def find_table_contours(vertical, horizontal):
|
||||||
|
|
||||||
|
|
||||||
def find_table_joints(contours, vertical, horizontal):
|
def find_table_joints(contours, vertical, horizontal):
|
||||||
"""
|
"""Finds joints/intersections present inside each table boundary.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
contours
|
contours : list
|
||||||
vertical
|
List of tuples representing table boundaries. Each tuple is of
|
||||||
horizontal
|
the form (x, y, w, h) where (x, y) -> left-top, w -> width and
|
||||||
|
h -> height in image coordinate space.
|
||||||
|
vertical : object
|
||||||
|
numpy.ndarray representing pixels where vertical lines lie.
|
||||||
|
horizontal : object
|
||||||
|
numpy.ndarray representing pixels where horizontal lines lie.
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
|
tables : dict
|
||||||
|
Dict with table boundaries as keys and list of intersections
|
||||||
|
in that boundary as their value.
|
||||||
|
Keys are of the form (x1, y1, x2, y2) where (x1, y1) -> lb
|
||||||
|
and (x2, y2) -> rt in image coordinate space.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
joints = np.bitwise_and(vertical, horizontal)
|
joints = np.bitwise_and(vertical, horizontal)
|
||||||
|
|
@ -150,15 +198,24 @@ def find_table_joints(contours, vertical, horizontal):
|
||||||
|
|
||||||
|
|
||||||
def remove_lines(threshold, line_size_scaling=15):
|
def remove_lines(threshold, line_size_scaling=15):
|
||||||
"""
|
"""Removes lines from a thresholded image.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
threshold
|
threshold : object
|
||||||
line_size_scaling
|
numpy.ndarray representing the thresholded image.
|
||||||
|
line_size_scaling : int, optional (default: 15)
|
||||||
|
Factor by which the page dimensions will be divided to get
|
||||||
|
smallest length of lines that should be detected.
|
||||||
|
|
||||||
|
The larger this value, the smaller the detected lines. Making it
|
||||||
|
too large will lead to text being detected as lines.
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
|
threshold : object
|
||||||
|
numpy.ndarray representing the thresholded image
|
||||||
|
with horizontal and vertical lines removed.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
size = threshold.shape[0] // line_size_scaling
|
size = threshold.shape[0] // line_size_scaling
|
||||||
|
|
@ -178,16 +235,23 @@ def remove_lines(threshold, line_size_scaling=15):
|
||||||
|
|
||||||
|
|
||||||
def find_cuts(threshold, char_size_scaling=200):
|
def find_cuts(threshold, char_size_scaling=200):
|
||||||
"""
|
"""Finds cuts made by text projections on y-axis.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
threshold
|
threshold : object
|
||||||
char_size_scaling
|
numpy.ndarray representing the thresholded image.
|
||||||
|
char_size_scaling : int, optional (default: 200)
|
||||||
|
Factor by which the page dimensions will be divided to get
|
||||||
|
smallest length of lines that should be detected.
|
||||||
|
|
||||||
|
The larger this value, smaller the detected lines. Making it
|
||||||
|
too large will lead to text being detected as lines.
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
|
y_cuts : list
|
||||||
|
List of cuts on y-axis.
|
||||||
"""
|
"""
|
||||||
size = threshold.shape[0] // char_size_scaling
|
size = threshold.shape[0] // char_size_scaling
|
||||||
char_el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size))
|
char_el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size))
|
||||||
|
|
|
||||||
|
|
@ -2,20 +2,93 @@ from .handlers import PDFHandler
|
||||||
|
|
||||||
|
|
||||||
def read_pdf(filepath, pages='1', mesh=False, **kwargs):
|
def read_pdf(filepath, pages='1', mesh=False, **kwargs):
|
||||||
"""
|
"""Read PDF and return parsed data tables.
|
||||||
|
|
||||||
|
Note: kwargs annotated with ^ can only be used with mesh=False
|
||||||
|
and kwargs annotated with * can only be used with mesh=True.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
filepath
|
filepath : str
|
||||||
pages
|
Path to pdf file.
|
||||||
mesh
|
pages : str
|
||||||
kwargs
|
Comma-separated page numbers to parse.
|
||||||
|
Example: 1,3,4 or 1,4-end
|
||||||
|
mesh : bool (default: False)
|
||||||
|
Whether or not to use Lattice method of parsing. Stream
|
||||||
|
is used by default.
|
||||||
|
table_area : list, optional (default: None)
|
||||||
|
List of table areas to analyze as strings of the form
|
||||||
|
x1,y1,x2,y2 where (x1, y1) -> left-top and
|
||||||
|
(x2, y2) -> right-bottom in pdf coordinate space.
|
||||||
|
columns^ : list, optional (default: None)
|
||||||
|
List of column x-coordinates as strings where the coordinates
|
||||||
|
are comma-separated.
|
||||||
|
split_text : bool, optional (default: False)
|
||||||
|
Whether or not to split a text line if it spans across
|
||||||
|
multiple cells.
|
||||||
|
flag_size : bool, optional (default: False)
|
||||||
|
Whether or not to highlight a substring using <s></s>
|
||||||
|
if its size is different from rest of the string, useful for
|
||||||
|
super and subscripts.
|
||||||
|
row_close_tol^ : int, optional (default: 2)
|
||||||
|
Rows will be formed by combining text vertically
|
||||||
|
within this tolerance.
|
||||||
|
col_close_tol^ : int, optional (default: 0)
|
||||||
|
Columns will be formed by combining text horizontally
|
||||||
|
within this tolerance.
|
||||||
|
process_background* : bool, optional (default: False)
|
||||||
|
Whether or not to process lines that are in background.
|
||||||
|
line_size_scaling* : int, optional (default: 15)
|
||||||
|
Factor by which the page dimensions will be divided to get
|
||||||
|
smallest length of lines that should be detected.
|
||||||
|
|
||||||
|
The larger this value, the smaller the detected lines. Making it
|
||||||
|
too large will lead to text being detected as lines.
|
||||||
|
copy_text* : list, optional (default: None)
|
||||||
|
{'h', 'v'}
|
||||||
|
Select one or more strings from above and pass them as a list
|
||||||
|
to specify the direction in which text should be copied over
|
||||||
|
when a cell spans multiple rows or columns.
|
||||||
|
shift_text* : list, optional (default: ['l', 't'])
|
||||||
|
{'l', 'r', 't', 'b'}
|
||||||
|
Select one or more strings from above and pass them as a list
|
||||||
|
to specify where the text in a spanning cell should flow.
|
||||||
|
line_close_tol* : int, optional (default: 2)
|
||||||
|
Tolerance parameter used to merge vertical and horizontal
|
||||||
|
detected lines which lie close to each other.
|
||||||
|
joint_close_tol* : int, optional (default: 2)
|
||||||
|
Tolerance parameter used to decide whether the detected lines
|
||||||
|
and points lie close to each other.
|
||||||
|
threshold_blocksize : int, optional (default: 15)
|
||||||
|
Size of a pixel neighborhood that is used to calculate a
|
||||||
|
threshold value for the pixel: 3, 5, 7, and so on.
|
||||||
|
|
||||||
|
For more information, refer to `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
|
||||||
|
threshold_constant : int, optional (default: -2)
|
||||||
|
Constant subtracted from the mean or weighted mean.
|
||||||
|
Normally, it is positive but may be zero or negative as well.
|
||||||
|
|
||||||
|
For more information, refer to `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
|
||||||
|
iterations : int, optional (default: 0)
|
||||||
|
Number of times erosion/dilation is applied.
|
||||||
|
|
||||||
|
For more information, refer to `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
|
||||||
|
margins : tuple
|
||||||
|
PDFMiner margins. (char_margin, line_margin, word_margin)
|
||||||
|
|
||||||
|
For more information, refer to `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
|
||||||
|
debug : bool, optional (default: False)
|
||||||
|
Whether or not to return all text objects on the page
|
||||||
|
which can be used to generate a matplotlib plot, to get
|
||||||
|
values for table_area(s) and debugging.
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
|
tables : camelot.core.TableList
|
||||||
|
|
||||||
"""
|
"""
|
||||||
# explicit type conversion
|
# validate kwargs?
|
||||||
p = PDFHandler(filepath, pages)
|
p = PDFHandler(filepath, pages)
|
||||||
tables, __ = p.parse(mesh=mesh, **kwargs)
|
tables, __ = p.parse(mesh=mesh, **kwargs)
|
||||||
return tables
|
return tables
|
||||||
|
|
@ -5,8 +5,7 @@ from ..utils import get_page_layout, get_text_objects
|
||||||
|
|
||||||
|
|
||||||
class BaseParser(object):
|
class BaseParser(object):
|
||||||
"""
|
"""Defines a base parser.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
def _generate_layout(self, filename):
|
def _generate_layout(self, filename):
|
||||||
self.filename = filename
|
self.filename = filename
|
||||||
|
|
|
||||||
|
|
@ -11,7 +11,7 @@ from .base import BaseParser
|
||||||
from ..core import Table
|
from ..core import Table
|
||||||
from ..utils import (scale_image, scale_pdf, segments_in_bbox, text_in_bbox,
|
from ..utils import (scale_image, scale_pdf, segments_in_bbox, text_in_bbox,
|
||||||
merge_close_lines, get_table_index, compute_accuracy,
|
merge_close_lines, get_table_index, compute_accuracy,
|
||||||
count_empty_strings, encode_, setup_logging)
|
compute_whitespace, setup_logging, encode_)
|
||||||
from ..image_processing import (adaptive_threshold, find_lines,
|
from ..image_processing import (adaptive_threshold, find_lines,
|
||||||
find_table_contours, find_table_joints)
|
find_table_contours, find_table_joints)
|
||||||
|
|
||||||
|
|
@ -20,14 +20,74 @@ logger = setup_logging(__name__)
|
||||||
|
|
||||||
|
|
||||||
class Lattice(BaseParser):
|
class Lattice(BaseParser):
|
||||||
"""
|
"""Lattice method of parsing looks for lines between text
|
||||||
|
to form a table.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
table_area : list, optional (default: None)
|
||||||
|
List of table areas to analyze as strings of the form
|
||||||
|
x1,y1,x2,y2 where (x1, y1) -> left-top and
|
||||||
|
(x2, y2) -> right-bottom in pdf coordinate space.
|
||||||
|
process_background : bool, optional (default: False)
|
||||||
|
Whether or not to process lines that are in background.
|
||||||
|
line_size_scaling : int, optional (default: 15)
|
||||||
|
Factor by which the page dimensions will be divided to get
|
||||||
|
smallest length of lines that should be detected.
|
||||||
|
|
||||||
|
The larger this value, the smaller the detected lines. Making it
|
||||||
|
too large will lead to text being detected as lines.
|
||||||
|
copy_text : list, optional (default: None)
|
||||||
|
{'h', 'v'}
|
||||||
|
Select one or more strings from above and pass them as a list
|
||||||
|
to specify the direction in which text should be copied over
|
||||||
|
when a cell spans multiple rows or columns.
|
||||||
|
shift_text : list, optional (default: ['l', 't'])
|
||||||
|
{'l', 'r', 't', 'b'}
|
||||||
|
Select one or more strings from above and pass them as a list
|
||||||
|
to specify where the text in a spanning cell should flow.
|
||||||
|
split_text : bool, optional (default: False)
|
||||||
|
Whether or not to split a text line if it spans across
|
||||||
|
multiple cells.
|
||||||
|
flag_size : bool, optional (default: False)
|
||||||
|
Whether or not to highlight a substring using <s></s>
|
||||||
|
if its size is different from rest of the string, useful for
|
||||||
|
super and subscripts.
|
||||||
|
line_close_tol : int, optional (default: 2)
|
||||||
|
Tolerance parameter used to merge vertical and horizontal
|
||||||
|
detected lines which lie close to each other.
|
||||||
|
joint_close_tol : int, optional (default: 2)
|
||||||
|
Tolerance parameter used to decide whether the detected lines
|
||||||
|
and points lie close to each other.
|
||||||
|
threshold_blocksize : int, optional (default: 15)
|
||||||
|
Size of a pixel neighborhood that is used to calculate a
|
||||||
|
threshold value for the pixel: 3, 5, 7, and so on.
|
||||||
|
|
||||||
|
For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
|
||||||
|
threshold_constant : int, optional (default: -2)
|
||||||
|
Constant subtracted from the mean or weighted mean.
|
||||||
|
Normally, it is positive but may be zero or negative as well.
|
||||||
|
|
||||||
|
For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
|
||||||
|
iterations : int, optional (default: 0)
|
||||||
|
Number of times erosion/dilation is applied.
|
||||||
|
|
||||||
|
For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
|
||||||
|
margins : tuple
|
||||||
|
PDFMiner margins. (char_margin, line_margin, word_margin)
|
||||||
|
|
||||||
|
For more information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
|
||||||
|
debug : bool, optional (default: False)
|
||||||
|
Whether or not to return all text objects on the page
|
||||||
|
which can be used to generate a matplotlib plot, to get
|
||||||
|
values for table_area(s) and debugging.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
def __init__(self, table_area=None, process_background=False,
|
def __init__(self, table_area=None, process_background=False,
|
||||||
line_size_scaling=15, copy_text=None, shift_text=['l', 't'],
|
line_size_scaling=15, copy_text=None, shift_text=['l', 't'],
|
||||||
split_text=False, flag_size=False, line_close_tol=2,
|
split_text=False, flag_size=False, line_close_tol=2,
|
||||||
joint_close_tol=2, threshold_blocksize=15, threshold_constant=-2,
|
joint_close_tol=2, threshold_blocksize=15, threshold_constant=-2,
|
||||||
iterations=0, margins=(1.0, 0.5, 0.1), debug=None):
|
iterations=0, margins=(1.0, 0.5, 0.1), debug=False):
|
||||||
self.table_area = table_area
|
self.table_area = table_area
|
||||||
self.process_background = process_background
|
self.process_background = process_background
|
||||||
self.line_size_scaling = line_size_scaling
|
self.line_size_scaling = line_size_scaling
|
||||||
|
|
@ -45,6 +105,27 @@ class Lattice(BaseParser):
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _reduce_index(t, idx, shift_text):
|
def _reduce_index(t, idx, shift_text):
|
||||||
|
"""Reduces index of a text object if it lies within a spanning
|
||||||
|
cell.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
t : camelot.core.Table
|
||||||
|
idx : list
|
||||||
|
List of tuples of the form (r_idx, c_idx, text).
|
||||||
|
shift_text : list
|
||||||
|
{'l', 'r', 't', 'b'}
|
||||||
|
Select one or more strings from above and pass them as a
|
||||||
|
list to specify where the text in a spanning cell should
|
||||||
|
flow.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
indices : list
|
||||||
|
List of tuples of the form (r_idx, c_idx, text) where
|
||||||
|
r_idx and c_idx are new row and column indices for text.
|
||||||
|
|
||||||
|
"""
|
||||||
indices = []
|
indices = []
|
||||||
for r_idx, c_idx, text in idx:
|
for r_idx, c_idx, text in idx:
|
||||||
for d in shift_text:
|
for d in shift_text:
|
||||||
|
|
@ -69,6 +150,22 @@ class Lattice(BaseParser):
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _copy_spanning_text(t, copy_text=None):
|
def _copy_spanning_text(t, copy_text=None):
|
||||||
|
"""Copies over text in empty spanning cells.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
t : camelot.core.Table
|
||||||
|
copy_text : list, optional (default: None)
|
||||||
|
{'h', 'v'}
|
||||||
|
Select one or more strings from above and pass them as a list
|
||||||
|
to specify the direction in which text should be copied over
|
||||||
|
when a cell spans multiple rows or columns.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
t : camelot.core.Table
|
||||||
|
|
||||||
|
"""
|
||||||
for f in copy_text:
|
for f in copy_text:
|
||||||
if f == "h":
|
if f == "h":
|
||||||
for i in range(len(t.cells)):
|
for i in range(len(t.cells)):
|
||||||
|
|
@ -199,7 +296,7 @@ class Lattice(BaseParser):
|
||||||
table.df = pd.DataFrame(data)
|
table.df = pd.DataFrame(data)
|
||||||
table.shape = table.df.shape
|
table.shape = table.df.shape
|
||||||
|
|
||||||
whitespace, __, __ = count_empty_strings(data)
|
whitespace = compute_whitespace(data)
|
||||||
table.accuracy = accuracy
|
table.accuracy = accuracy
|
||||||
table.whitespace = whitespace
|
table.whitespace = whitespace
|
||||||
table.order = table_idx + 1
|
table.order = table_idx + 1
|
||||||
|
|
@ -208,16 +305,6 @@ class Lattice(BaseParser):
|
||||||
return table
|
return table
|
||||||
|
|
||||||
def extract_tables(self, filename):
|
def extract_tables(self, filename):
|
||||||
"""
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
filename
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
|
|
||||||
"""
|
|
||||||
logger.info('Processing {}'.format(os.path.basename(filename)))
|
logger.info('Processing {}'.format(os.path.basename(filename)))
|
||||||
self._generate_layout(filename)
|
self._generate_layout(filename)
|
||||||
|
|
||||||
|
|
@ -237,7 +324,7 @@ class Lattice(BaseParser):
|
||||||
table = self._generate_table(table_idx, cols, rows, v_s=v_s, h_s=h_s)
|
table = self._generate_table(table_idx, cols, rows, v_s=v_s, h_s=h_s)
|
||||||
_tables.append(table)
|
_tables.append(table)
|
||||||
|
|
||||||
if self.debug is not None:
|
if self.debug:
|
||||||
text = []
|
text = []
|
||||||
text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
|
text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
|
||||||
text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
|
text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
|
||||||
|
|
|
||||||
|
|
@ -8,19 +8,54 @@ import pandas as pd
|
||||||
from .base import BaseParser
|
from .base import BaseParser
|
||||||
from ..core import Table
|
from ..core import Table
|
||||||
from ..utils import (text_in_bbox, get_table_index, compute_accuracy,
|
from ..utils import (text_in_bbox, get_table_index, compute_accuracy,
|
||||||
count_empty_strings, encode_, setup_logging)
|
compute_whitespace, setup_logging, encode_)
|
||||||
|
|
||||||
|
|
||||||
logger = setup_logging(__name__)
|
logger = setup_logging(__name__)
|
||||||
|
|
||||||
|
|
||||||
class Stream(BaseParser):
|
class Stream(BaseParser):
|
||||||
"""
|
"""Stream method of parsing looks for spaces between text
|
||||||
|
to form a table.
|
||||||
|
|
||||||
|
If you want to specify columns when specifying multiple table
|
||||||
|
areas, make sure that the lengths of both lists are equal.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
table_area : list, optional (default: None)
|
||||||
|
List of table areas to analyze as strings of the form
|
||||||
|
x1,y1,x2,y2 where (x1, y1) -> left-top and
|
||||||
|
(x2, y2) -> right-bottom in pdf coordinate space.
|
||||||
|
columns : list, optional (default: None)
|
||||||
|
List of column x-coordinates as strings where the coordinates
|
||||||
|
are comma-separated.
|
||||||
|
split_text : bool, optional (default: False)
|
||||||
|
Whether or not to split a text line if it spans across
|
||||||
|
multiple cells.
|
||||||
|
flag_size : bool, optional (default: False)
|
||||||
|
Whether or not to highlight a substring using <s></s>
|
||||||
|
if its size is different from rest of the string, useful for
|
||||||
|
super and subscripts.
|
||||||
|
row_close_tol : int, optional (default: 2)
|
||||||
|
Rows will be formed by combining text vertically
|
||||||
|
within this tolerance.
|
||||||
|
col_close_tol : int, optional (default: 0)
|
||||||
|
Columns will be formed by combining text horizontally
|
||||||
|
within this tolerance.
|
||||||
|
margins : tuple, optional (default: (1.0, 0.5, 0.1))
|
||||||
|
PDFMiner margins. (char_margin, line_margin, word_margin)
|
||||||
|
|
||||||
|
For more information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
|
||||||
|
debug : bool, optional (default: False)
|
||||||
|
Whether or not to return all text objects on the page
|
||||||
|
which can be used to generate a matplotlib plot, to get
|
||||||
|
values for table_area(s), columns and debugging.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
def __init__(self, table_area=None, columns=None, split_text=False,
|
def __init__(self, table_area=None, columns=None, split_text=False,
|
||||||
flag_size=False, row_close_tol=2, col_close_tol=0,
|
flag_size=False, row_close_tol=2, col_close_tol=0,
|
||||||
margins=(1.0, 0.5, 0.1), debug=None):
|
margins=(1.0, 0.5, 0.1), debug=False):
|
||||||
self.table_area = table_area
|
self.table_area = table_area
|
||||||
self.columns = columns
|
self.columns = columns
|
||||||
self._validate_columns()
|
self._validate_columns()
|
||||||
|
|
@ -33,6 +68,20 @@ class Stream(BaseParser):
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _text_bbox(t_bbox):
|
def _text_bbox(t_bbox):
|
||||||
|
"""Returns bounding box for the text present on a page.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
t_bbox : dict
|
||||||
|
Dict with two keys 'horizontal' and 'vertical' with lists of
|
||||||
|
LTTextLineHorizontals and LTTextLineVerticals respectively.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
text_bbox : tuple
|
||||||
|
Tuple (x0, y0, x1, y1) in pdf coordinate space.
|
||||||
|
|
||||||
|
"""
|
||||||
xmin = min([t.x0 for direction in t_bbox for t in t_bbox[direction]])
|
xmin = min([t.x0 for direction in t_bbox for t in t_bbox[direction]])
|
||||||
ymin = min([t.y0 for direction in t_bbox for t in t_bbox[direction]])
|
ymin = min([t.y0 for direction in t_bbox for t in t_bbox[direction]])
|
||||||
xmax = max([t.x1 for direction in t_bbox for t in t_bbox[direction]])
|
xmax = max([t.x1 for direction in t_bbox for t in t_bbox[direction]])
|
||||||
|
|
@ -42,6 +91,21 @@ class Stream(BaseParser):
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _group_rows(text, row_close_tol=2):
|
def _group_rows(text, row_close_tol=2):
|
||||||
|
"""Groups PDFMiner text objects into rows vertically
|
||||||
|
within a tolerance.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
text : list
|
||||||
|
List of PDFMiner text objects.
|
||||||
|
row_close_tol : int, optional (default: 2)
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
rows : list
|
||||||
|
Two-dimensional list of text objects grouped into rows.
|
||||||
|
|
||||||
|
"""
|
||||||
row_y = 0
|
row_y = 0
|
||||||
rows = []
|
rows = []
|
||||||
temp = []
|
temp = []
|
||||||
|
|
@ -61,6 +125,21 @@ class Stream(BaseParser):
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _merge_columns(l, col_close_tol=0):
|
def _merge_columns(l, col_close_tol=0):
|
||||||
|
"""Merges column boundaries horizontally if they overlap
|
||||||
|
or lie within a tolerance.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
l : list
|
||||||
|
List of column x-coordinate tuples.
|
||||||
|
col_close_tol : int, optional (default: 0)
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
merged : list
|
||||||
|
List of merged column x-coordinate tuples.
|
||||||
|
|
||||||
|
"""
|
||||||
merged = []
|
merged = []
|
||||||
for higher in l:
|
for higher in l:
|
||||||
if not merged:
|
if not merged:
|
||||||
|
|
@ -89,6 +168,21 @@ class Stream(BaseParser):
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _join_rows(rows_grouped, text_y_max, text_y_min):
|
def _join_rows(rows_grouped, text_y_max, text_y_min):
|
||||||
|
"""Makes row coordinates continuous.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
rows_grouped : list
|
||||||
|
Two-dimensional list of text objects grouped into rows.
|
||||||
|
text_y_max : int
|
||||||
|
text_y_min : int
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
rows : list
|
||||||
|
List of continuous row y-coordinate tuples.
|
||||||
|
|
||||||
|
"""
|
||||||
row_mids = [sum([(t.y0 + t.y1) / 2 for t in r]) / len(r)
|
row_mids = [sum([(t.y0 + t.y1) / 2 for t in r]) / len(r)
|
||||||
if len(r) > 0 else 0 for r in rows_grouped]
|
if len(r) > 0 else 0 for r in rows_grouped]
|
||||||
rows = [(row_mids[i] + row_mids[i - 1]) / 2 for i in range(1, len(row_mids))]
|
rows = [(row_mids[i] + row_mids[i - 1]) / 2 for i in range(1, len(row_mids))]
|
||||||
|
|
@ -100,6 +194,23 @@ class Stream(BaseParser):
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _add_columns(cols, text, row_close_tol):
|
def _add_columns(cols, text, row_close_tol):
|
||||||
|
"""Adds columns to existing list by taking into account
|
||||||
|
the text that lies outside the current column x-coordinates.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
cols : list
|
||||||
|
List of column x-coordinate tuples.
|
||||||
|
text : list
|
||||||
|
List of PDFMiner text objects.
|
||||||
|
row_close_tol : int
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
cols : list
|
||||||
|
Updated list of column x-coordinate tuples.
|
||||||
|
|
||||||
|
"""
|
||||||
if text:
|
if text:
|
||||||
text = Stream._group_rows(text, row_close_tol=row_close_tol)
|
text = Stream._group_rows(text, row_close_tol=row_close_tol)
|
||||||
elements = [len(r) for r in text]
|
elements = [len(r) for r in text]
|
||||||
|
|
@ -110,6 +221,21 @@ class Stream(BaseParser):
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _join_columns(cols, text_x_min, text_x_max):
|
def _join_columns(cols, text_x_min, text_x_max):
|
||||||
|
"""Makes column coordinates continuous.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
cols : list
|
||||||
|
List of column x-coordinate tuples.
|
||||||
|
text_x_min : int
|
||||||
|
text_x_max : int
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
cols : list
|
||||||
|
Updated list of column x-coordinate tuples.
|
||||||
|
|
||||||
|
"""
|
||||||
cols = sorted(cols)
|
cols = sorted(cols)
|
||||||
cols = [(cols[i][0] + cols[i - 1][1]) / 2 for i in range(1, len(cols))]
|
cols = [(cols[i][0] + cols[i - 1][1]) / 2 for i in range(1, len(cols))]
|
||||||
cols.insert(0, text_x_min)
|
cols.insert(0, text_x_min)
|
||||||
|
|
@ -207,7 +333,7 @@ class Stream(BaseParser):
|
||||||
table.df = pd.DataFrame(data)
|
table.df = pd.DataFrame(data)
|
||||||
table.shape = table.df.shape
|
table.shape = table.df.shape
|
||||||
|
|
||||||
whitespace, __, __ = count_empty_strings(data)
|
whitespace = compute_whitespace(data)
|
||||||
table.accuracy = accuracy
|
table.accuracy = accuracy
|
||||||
table.whitespace = whitespace
|
table.whitespace = whitespace
|
||||||
table.order = table_idx + 1
|
table.order = table_idx + 1
|
||||||
|
|
@ -216,16 +342,6 @@ class Stream(BaseParser):
|
||||||
return table
|
return table
|
||||||
|
|
||||||
def extract_tables(self, filename):
|
def extract_tables(self, filename):
|
||||||
"""
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
filename
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
|
|
||||||
"""
|
|
||||||
logger.info('Processing {}'.format(os.path.basename(filename)))
|
logger.info('Processing {}'.format(os.path.basename(filename)))
|
||||||
self._generate_layout(filename)
|
self._generate_layout(filename)
|
||||||
|
|
||||||
|
|
@ -244,7 +360,7 @@ class Stream(BaseParser):
|
||||||
table = self._generate_table(table_idx, cols, rows)
|
table = self._generate_table(table_idx, cols, rows)
|
||||||
_tables.append(table)
|
_tables.append(table)
|
||||||
|
|
||||||
if self.debug is not None:
|
if self.debug:
|
||||||
text = []
|
text = []
|
||||||
text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
|
text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
|
||||||
text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
|
text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
|
||||||
|
|
|
||||||
|
|
@ -6,19 +6,101 @@ from .handlers import PDFHandler
|
||||||
|
|
||||||
|
|
||||||
def plot_geometry(filepath, pages='1', mesh=False, geometry_type='text', **kwargs):
|
def plot_geometry(filepath, pages='1', mesh=False, geometry_type='text', **kwargs):
|
||||||
"""
|
"""Plot geometry found on pdf page based on type specified,
|
||||||
|
useful for debugging and playing with different parameters to get
|
||||||
|
the best output.
|
||||||
|
|
||||||
|
Note: kwargs annotated with ^ can only be used with mesh=False
|
||||||
|
and kwargs annotated with * can only be used with mesh=True.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
filepath
|
filepath : str
|
||||||
pages
|
Path to pdf file.
|
||||||
mesh
|
pages : str
|
||||||
geometry_type
|
Comma-separated page numbers to parse.
|
||||||
kwargs
|
Example: 1,3,4 or 1,4-end
|
||||||
|
mesh : bool (default: False)
|
||||||
|
Whether or not to use Lattice method of parsing. Stream
|
||||||
|
is used by default.
|
||||||
|
geometry_type : str, optional (default: 'text')
|
||||||
|
'text' : Plot text objects found on page, useful to get
|
||||||
|
table_area and columns coordinates.
|
||||||
|
'table' : Plot parsed table.
|
||||||
|
'contour'* : Plot detected rectangles.
|
||||||
|
'joint'* : Plot detected line intersections.
|
||||||
|
'line'* : Plot detected lines.
|
||||||
|
table_area : list, optional (default: None)
|
||||||
|
List of table areas to analyze as strings of the form
|
||||||
|
x1,y1,x2,y2 where (x1, y1) -> left-top and
|
||||||
|
(x2, y2) -> right-bottom in pdf coordinate space.
|
||||||
|
columns^ : list, optional (default: None)
|
||||||
|
List of column x-coordinates as strings where the coordinates
|
||||||
|
are comma-separated.
|
||||||
|
split_text : bool, optional (default: False)
|
||||||
|
Whether or not to split a text line if it spans across
|
||||||
|
multiple cells.
|
||||||
|
flag_size : bool, optional (default: False)
|
||||||
|
Whether or not to highlight a substring using <s></s>
|
||||||
|
if its size is different from rest of the string, useful for
|
||||||
|
super and subscripts.
|
||||||
|
row_close_tol^ : int, optional (default: 2)
|
||||||
|
Rows will be formed by combining text vertically
|
||||||
|
within this tolerance.
|
||||||
|
col_close_tol^ : int, optional (default: 0)
|
||||||
|
Columns will be formed by combining text horizontally
|
||||||
|
within this tolerance.
|
||||||
|
process_background* : bool, optional (default: False)
|
||||||
|
Whether or not to process lines that are in background.
|
||||||
|
line_size_scaling* : int, optional (default: 15)
|
||||||
|
Factor by which the page dimensions will be divided to get
|
||||||
|
smallest length of lines that should be detected.
|
||||||
|
|
||||||
|
The larger this value, the smaller the detected lines. Making it
|
||||||
|
too large will lead to text being detected as lines.
|
||||||
|
copy_text* : list, optional (default: None)
|
||||||
|
{'h', 'v'}
|
||||||
|
Select one or more strings from above and pass them as a list
|
||||||
|
to specify the direction in which text should be copied over
|
||||||
|
when a cell spans multiple rows or columns.
|
||||||
|
shift_text* : list, optional (default: ['l', 't'])
|
||||||
|
{'l', 'r', 't', 'b'}
|
||||||
|
Select one or more strings from above and pass them as a list
|
||||||
|
to specify where the text in a spanning cell should flow.
|
||||||
|
line_close_tol* : int, optional (default: 2)
|
||||||
|
Tolerance parameter used to merge vertical and horizontal
|
||||||
|
detected lines which lie close to each other.
|
||||||
|
joint_close_tol* : int, optional (default: 2)
|
||||||
|
Tolerance parameter used to decide whether the detected lines
|
||||||
|
and points lie close to each other.
|
||||||
|
threshold_blocksize : int, optional (default: 15)
|
||||||
|
Size of a pixel neighborhood that is used to calculate a
|
||||||
|
threshold value for the pixel: 3, 5, 7, and so on.
|
||||||
|
|
||||||
|
For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
|
||||||
|
threshold_constant : int, optional (default: -2)
|
||||||
|
Constant subtracted from the mean or weighted mean.
|
||||||
|
Normally, it is positive but may be zero or negative as well.
|
||||||
|
|
||||||
|
For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
|
||||||
|
iterations : int, optional (default: 0)
|
||||||
|
Number of times erosion/dilation is applied.
|
||||||
|
|
||||||
|
For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
|
||||||
|
margins : tuple
|
||||||
|
PDFMiner margins. (char_margin, line_margin, word_margin)
|
||||||
|
|
||||||
|
For more information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
|
||||||
|
debug : bool, optional (default: False)
|
||||||
|
Whether or not to return all text objects on the page
|
||||||
|
which can be used to generate a matplotlib plot, to get
|
||||||
|
values for table_area(s) and debugging.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
# explicit type conversion
|
# validate kwargs?
|
||||||
p = PDFHandler(filepath, pages)
|
p = PDFHandler(filepath, pages)
|
||||||
kwargs.update({'debug': geometry_type})
|
debug = True if geometry_type else False
|
||||||
|
kwargs.update({'debug': debug})
|
||||||
__, geometry = p.parse(mesh=mesh, **kwargs)
|
__, geometry = p.parse(mesh=mesh, **kwargs)
|
||||||
|
|
||||||
if geometry_type == 'text':
|
if geometry_type == 'text':
|
||||||
|
|
|
||||||
360
camelot/utils.py
360
camelot/utils.py
|
|
@ -19,14 +19,15 @@ from pdfminer.layout import (LAParams, LTAnno, LTChar, LTTextLineHorizontal,
|
||||||
|
|
||||||
|
|
||||||
def setup_logging(name):
|
def setup_logging(name):
|
||||||
"""
|
"""Sets up a logger with StreamHandler.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
name
|
name : str
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
|
logger : logging.Logger
|
||||||
|
|
||||||
"""
|
"""
|
||||||
logger = logging.getLogger(name)
|
logger = logging.getLogger(name)
|
||||||
|
|
@ -47,15 +48,16 @@ logger = setup_logging(__name__)
|
||||||
|
|
||||||
|
|
||||||
def translate(x1, x2):
|
def translate(x1, x2):
|
||||||
"""
|
"""Translates x2 by x1.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
x1
|
x1 : float
|
||||||
x2
|
x2 : float
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
|
x2 : float
|
||||||
|
|
||||||
"""
|
"""
|
||||||
x2 += x1
|
x2 += x1
|
||||||
|
|
@ -63,15 +65,16 @@ def translate(x1, x2):
|
||||||
|
|
||||||
|
|
||||||
def scale(x, s):
|
def scale(x, s):
|
||||||
"""
|
"""Scales x by scaling factor s.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
x
|
x : float
|
||||||
s
|
s : float
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
|
x : float
|
||||||
|
|
||||||
"""
|
"""
|
||||||
x *= s
|
x *= s
|
||||||
|
|
@ -79,18 +82,21 @@ def scale(x, s):
|
||||||
|
|
||||||
|
|
||||||
def rotate(x1, y1, x2, y2, angle):
|
def rotate(x1, y1, x2, y2, angle):
|
||||||
"""
|
"""Rotates point x2, y2 about point x1, y1 by angle.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
x1
|
x1 : float
|
||||||
y1
|
y1 : float
|
||||||
x2
|
x2 : float
|
||||||
y2
|
y2 : float
|
||||||
angle
|
angle : float
|
||||||
|
Angle in radians.
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
|
xnew : float
|
||||||
|
ynew : float
|
||||||
|
|
||||||
"""
|
"""
|
||||||
s = np.sin(angle)
|
s = np.sin(angle)
|
||||||
|
|
@ -105,15 +111,26 @@ def rotate(x1, y1, x2, y2, angle):
|
||||||
|
|
||||||
|
|
||||||
def scale_pdf(k, factors):
|
def scale_pdf(k, factors):
|
||||||
"""
|
"""Translates and scales pdf coordinate space to image
|
||||||
|
coordinate space.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
k
|
k : tuple
|
||||||
factors
|
Tuple (x1, y1, x2, y2) representing table bounding box where
|
||||||
|
(x1, y1) -> lt and (x2, y2) -> rb in PDFMiner coordinate
|
||||||
|
space.
|
||||||
|
factors : tuple
|
||||||
|
Tuple (scaling_factor_x, scaling_factor_y, pdf_y) where the
|
||||||
|
first two elements are scaling factors and pdf_y is height of
|
||||||
|
pdf.
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
|
knew : tuple
|
||||||
|
Tuple (x1, y1, x2, y2) representing table bounding box where
|
||||||
|
(x1, y1) -> lt and (x2, y2) -> rb in OpenCV coordinate
|
||||||
|
space.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
x1, y1, x2, y2 = k
|
x1, y1, x2, y2 = k
|
||||||
|
|
@ -127,17 +144,28 @@ def scale_pdf(k, factors):
|
||||||
|
|
||||||
|
|
||||||
def scale_image(tables, v_segments, h_segments, factors):
|
def scale_image(tables, v_segments, h_segments, factors):
|
||||||
"""
|
"""Translates and scales image coordinate space to pdf
|
||||||
|
coordinate space.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
tables
|
tables : dict
|
||||||
v_segments
|
Dict with table boundaries as keys and list of intersections
|
||||||
h_segments
|
in that boundary as value.
|
||||||
factors
|
v_segments : list
|
||||||
|
List of vertical line segments.
|
||||||
|
h_segments : list
|
||||||
|
List of horizontal line segments.
|
||||||
|
factors : tuple
|
||||||
|
Tuple (scaling_factor_x, scaling_factor_y, img_y) where the
|
||||||
|
first two elements are scaling factors and img_y is height of
|
||||||
|
image.
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
|
tables_new : dict
|
||||||
|
v_segments_new : dict
|
||||||
|
h_segments_new : dict
|
||||||
|
|
||||||
"""
|
"""
|
||||||
scaling_factor_x, scaling_factor_y, img_y = factors
|
scaling_factor_x, scaling_factor_y, img_y = factors
|
||||||
|
|
@ -172,16 +200,23 @@ def scale_image(tables, v_segments, h_segments, factors):
|
||||||
|
|
||||||
|
|
||||||
def get_rotation(lttextlh, lttextlv, ltchar):
|
def get_rotation(lttextlh, lttextlv, ltchar):
|
||||||
"""
|
"""Detects if text in table is rotated or not using the current
|
||||||
|
transformation matrix (CTM) and returns its orientation.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
lttextlh
|
lttextlh : list
|
||||||
lttextlv
|
List of PDFMiner LTTextLineHorizontal objects.
|
||||||
ltchar
|
lttextlv : list
|
||||||
|
List of PDFMiner LTTextLineVertical objects.
|
||||||
|
ltchar : list
|
||||||
|
List of PDFMiner LTChar objects.
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
|
rotation : string
|
||||||
|
'' if text in table is upright, 'anticlockwise' if rotated 90 degree
|
||||||
|
anticlockwise and 'clockwise' if rotated 90 degree clockwise.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
rotation = ''
|
rotation = ''
|
||||||
|
|
@ -190,21 +225,30 @@ def get_rotation(lttextlh, lttextlv, ltchar):
|
||||||
if hlen < vlen:
|
if hlen < vlen:
|
||||||
clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in ltchar)
|
clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in ltchar)
|
||||||
anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in ltchar)
|
anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in ltchar)
|
||||||
rotation = 'left' if clockwise < anticlockwise else 'right'
|
rotation = 'clockwise' if clockwise < anticlockwise else 'anticlockwise'
|
||||||
return rotation
|
return rotation
|
||||||
|
|
||||||
|
|
||||||
def segments_in_bbox(bbox, v_segments, h_segments):
|
def segments_in_bbox(bbox, v_segments, h_segments):
|
||||||
"""
|
"""Returns all line segments present inside a bounding box.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
bbox
|
bbox : tuple
|
||||||
v_segments
|
Tuple (x1, y1, x2, y2) representing a bounding box where
|
||||||
h_segments
|
(x1, y1) -> lb and (x2, y2) -> rt in PDFMiner coordinate
|
||||||
|
space.
|
||||||
|
v_segments : list
|
||||||
|
List of vertical line segments.
|
||||||
|
h_segments : list
|
||||||
|
List of horizontal line segments.
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
|
v_s : list
|
||||||
|
List of vertical line segments that lie inside table.
|
||||||
|
h_s : list
|
||||||
|
List of horizontal line segments that lie inside table.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
lb = (bbox[0], bbox[1])
|
lb = (bbox[0], bbox[1])
|
||||||
|
|
@ -217,35 +261,42 @@ def segments_in_bbox(bbox, v_segments, h_segments):
|
||||||
|
|
||||||
|
|
||||||
def text_in_bbox(bbox, text):
|
def text_in_bbox(bbox, text):
|
||||||
"""
|
"""Returns all text objects present inside a bounding box.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
bbox
|
bbox : tuple
|
||||||
text
|
Tuple (x1, y1, x2, y2) representing a bounding box where
|
||||||
|
(x1, y1) -> lb and (x2, y2) -> rt in PDFMiner coordinate
|
||||||
|
space.
|
||||||
|
text : list
|
||||||
|
List of PDFMiner text objects.
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
|
t_bbox : list
|
||||||
|
List of PDFMiner text objects that lie inside table.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
lb = (bbox[0], bbox[1])
|
lb = (bbox[0], bbox[1])
|
||||||
rt = (bbox[2], bbox[3])
|
rt = (bbox[2], bbox[3])
|
||||||
t_bbox = [t for t in text if lb[0] - 2 <= (t.x0 + t.x1) / 2.0
|
t_bbox = [t for t in text if lb[0] - 2 <= (t.x0 + t.x1) / 2.0
|
||||||
<= rt[0] + 2 and lb[1] - 2 <= (t.y0 + t.y1) / 2.0
|
<= rt[0] + 2 and lb[1] - 2 <= (t.y0 + t.y1) / 2.0
|
||||||
<= rt[1] + 2]
|
<= rt[1] + 2]
|
||||||
return t_bbox
|
return t_bbox
|
||||||
|
|
||||||
|
|
||||||
def remove_close_lines(ar, line_close_tol=2):
|
def remove_close_lines(ar, line_close_tol=2):
|
||||||
"""
|
"""Removes lines which are within a tolerance, based on their x or
|
||||||
|
y axis projections.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
ar
|
ar : list
|
||||||
line_close_tol
|
line_close_tol : int, optional (default: 2)
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
|
ret : list
|
||||||
|
|
||||||
"""
|
"""
|
||||||
ret = []
|
ret = []
|
||||||
|
|
@ -262,15 +313,17 @@ def remove_close_lines(ar, line_close_tol=2):
|
||||||
|
|
||||||
|
|
||||||
def merge_close_lines(ar, line_close_tol=2):
|
def merge_close_lines(ar, line_close_tol=2):
|
||||||
"""
|
"""Merges lines which are within a tolerance by calculating a
|
||||||
|
moving mean, based on their x or y axis projections.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
ar
|
ar : list
|
||||||
line_close_tol
|
line_close_tol : int, optional (default: 2)
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
|
ret : list
|
||||||
|
|
||||||
"""
|
"""
|
||||||
ret = []
|
ret = []
|
||||||
|
|
@ -288,15 +341,19 @@ def merge_close_lines(ar, line_close_tol=2):
|
||||||
|
|
||||||
|
|
||||||
def flag_font_size(textline, direction):
|
def flag_font_size(textline, direction):
|
||||||
"""
|
"""Flags super/subscripts in text by enclosing them with <s></s>.
|
||||||
|
May give false positives.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
textline
|
textline : list
|
||||||
direction
|
List of PDFMiner LTChar objects.
|
||||||
|
direction : string
|
||||||
|
Direction of the PDFMiner LTTextLine object.
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
|
fstring : string
|
||||||
|
|
||||||
"""
|
"""
|
||||||
if direction == 'horizontal':
|
if direction == 'horizontal':
|
||||||
|
|
@ -324,18 +381,27 @@ def flag_font_size(textline, direction):
|
||||||
return fstring
|
return fstring
|
||||||
|
|
||||||
|
|
||||||
def split_textline(table, textline, direction, flag_size=True):
|
def split_textline(table, textline, direction, flag_size=False):
|
||||||
"""
|
"""Splits PDFMiner LTTextLine into substrings if it spans across
|
||||||
|
multiple rows/columns.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
table
|
table : camelot.core.Table
|
||||||
textline
|
textline : object
|
||||||
direction
|
PDFMiner LTTextLine object.
|
||||||
flag_size
|
direction : string
|
||||||
|
Direction of the PDFMiner LTTextLine object.
|
||||||
|
flag_size : bool, optional (default: False)
|
||||||
|
Whether or not to highlight a substring using <s></s>
|
||||||
|
if its size is different from rest of the string, useful for
|
||||||
|
super and subscripts.
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
|
grouped_chars : list
|
||||||
|
List of tuples of the form (idx, text) where idx is the index
|
||||||
|
of row/column and text is an LTTextLine substring.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
idx = 0
|
idx = 0
|
||||||
|
|
@ -388,19 +454,38 @@ def split_textline(table, textline, direction, flag_size=True):
|
||||||
return grouped_chars
|
return grouped_chars
|
||||||
|
|
||||||
|
|
||||||
def get_table_index(table, t, direction, split_text=False, flag_size=True):
|
def get_table_index(table, t, direction, split_text=False, flag_size=False):
|
||||||
"""
|
"""Gets indices of the table cell where given text object lies by
|
||||||
|
comparing their y and x-coordinates.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
table
|
table : camelot.core.Table
|
||||||
t
|
t : object
|
||||||
direction
|
PDFMiner LTTextLine object.
|
||||||
split_text
|
direction : string
|
||||||
flag_size
|
Direction of the PDFMiner LTTextLine object.
|
||||||
|
split_text : bool, optional (default: False)
|
||||||
|
Whether or not to split a text line if it spans across
|
||||||
|
multiple cells.
|
||||||
|
flag_size : bool, optional (default: False)
|
||||||
|
Whether or not to highlight a substring using <s></s>
|
||||||
|
if its size is different from rest of the string, useful for
|
||||||
|
super and subscripts.
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
|
indices : list
|
||||||
|
List of tuples of the form (r_idx, c_idx, text) where r_idx
|
||||||
|
and c_idx are row and column indices.
|
||||||
|
error : float
|
||||||
|
Assignment error, percentage of text area that lies outside
|
||||||
|
a cell.
|
||||||
|
+-------+
|
||||||
|
| |
|
||||||
|
| [Text bounding box]
|
||||||
|
| |
|
||||||
|
+-------+
|
||||||
|
|
||||||
"""
|
"""
|
||||||
r_idx, c_idx = [-1] * 2
|
r_idx, c_idx = [-1] * 2
|
||||||
|
|
@ -450,14 +535,19 @@ def get_table_index(table, t, direction, split_text=False, flag_size=True):
|
||||||
|
|
||||||
|
|
||||||
def compute_accuracy(error_weights):
|
def compute_accuracy(error_weights):
|
||||||
"""
|
"""Calculates a score based on weights assigned to various
|
||||||
|
parameters and their error percentages.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
error_weights
|
error_weights : list
|
||||||
|
Two-dimensional list of the form [[p1, e1], [p2, e2], ...]
|
||||||
|
where pn is the weight assigned to list of errors en.
|
||||||
|
Sum of pn should be equal to 100.
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
|
score : float
|
||||||
|
|
||||||
"""
|
"""
|
||||||
SCORE_VAL = 100
|
SCORE_VAL = 100
|
||||||
|
|
@ -474,50 +564,40 @@ def compute_accuracy(error_weights):
|
||||||
return score
|
return score
|
||||||
|
|
||||||
|
|
||||||
def count_empty_strings(d):
|
def compute_whitespace(d):
|
||||||
"""
|
"""Calculates the percentage of empty strings in a
|
||||||
|
two-dimensional list.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
d
|
d : list
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
|
whitespace : float
|
||||||
|
Percentage of empty cells.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
empty_p = 0
|
whitespace = 0
|
||||||
r_nempty_cells, c_nempty_cells = [], []
|
r_nempty_cells, c_nempty_cells = [], []
|
||||||
for i in d:
|
for i in d:
|
||||||
for j in i:
|
for j in i:
|
||||||
if j.strip() == '':
|
if j.strip() == '':
|
||||||
empty_p += 1
|
whitespace += 1
|
||||||
empty_p = 100 * (empty_p / float(len(d) * len(d[0])))
|
whitespace = 100 * (whitespace / float(len(d) * len(d[0])))
|
||||||
for row in d:
|
return whitespace
|
||||||
r_nempty_c = 0
|
|
||||||
for r in row:
|
|
||||||
if r.strip() != '':
|
|
||||||
r_nempty_c += 1
|
|
||||||
r_nempty_cells.append(r_nempty_c)
|
|
||||||
d = zip(*d)
|
|
||||||
d = [list(col) for col in d]
|
|
||||||
for col in d:
|
|
||||||
c_nempty_c = 0
|
|
||||||
for c in col:
|
|
||||||
if c.strip() != '':
|
|
||||||
c_nempty_c += 1
|
|
||||||
c_nempty_cells.append(c_nempty_c)
|
|
||||||
return empty_p, r_nempty_cells, c_nempty_cells
|
|
||||||
|
|
||||||
|
|
||||||
def remove_empty_strings(d):
|
def remove_empty(d):
|
||||||
"""
|
"""Removes empty rows and columns from a two-dimensional list.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
d
|
d : list
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
|
d : list
|
||||||
|
|
||||||
"""
|
"""
|
||||||
for i, row in enumerate(d):
|
for i, row in enumerate(d):
|
||||||
|
|
@ -530,70 +610,46 @@ def remove_empty_strings(d):
|
||||||
|
|
||||||
|
|
||||||
def encode_(ar):
|
def encode_(ar):
|
||||||
"""
|
"""Encodes two-dimensional list into unicode.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
ar
|
ar : list
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
|
ar : list
|
||||||
|
|
||||||
"""
|
"""
|
||||||
ar = [[r.encode('utf-8') for r in row] for row in ar]
|
ar = [[r.encode('utf-8') for r in row] for row in ar]
|
||||||
return ar
|
return ar
|
||||||
|
|
||||||
|
|
||||||
def get_text_objects(layout, ltype="char", t=None):
|
def get_page_layout(filename, char_margin=1.0, line_margin=0.5, word_margin=0.1,
|
||||||
"""
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
layout
|
|
||||||
ltype
|
|
||||||
t
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
|
|
||||||
"""
|
|
||||||
if ltype == "char":
|
|
||||||
LTObject = LTChar
|
|
||||||
elif ltype == "lh":
|
|
||||||
LTObject = LTTextLineHorizontal
|
|
||||||
elif ltype == "lv":
|
|
||||||
LTObject = LTTextLineVertical
|
|
||||||
if t is None:
|
|
||||||
t = []
|
|
||||||
try:
|
|
||||||
for obj in layout._objs:
|
|
||||||
if isinstance(obj, LTObject):
|
|
||||||
t.append(obj)
|
|
||||||
else:
|
|
||||||
t += get_text_objects(obj, ltype=ltype)
|
|
||||||
except AttributeError:
|
|
||||||
pass
|
|
||||||
return t
|
|
||||||
|
|
||||||
|
|
||||||
def get_page_layout(pname, char_margin=1.0, line_margin=0.5, word_margin=0.1,
|
|
||||||
detect_vertical=True, all_texts=True):
|
detect_vertical=True, all_texts=True):
|
||||||
"""
|
"""Returns a PDFMiner LTPage object and page dimension of a single
|
||||||
|
page pdf. See https://euske.github.io/pdfminer/ to get definitions
|
||||||
|
of kwargs.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
pname
|
filename : string
|
||||||
char_margin
|
Path to pdf file.
|
||||||
line_margin
|
char_margin : float
|
||||||
word_margin
|
line_margin : float
|
||||||
detect_vertical
|
word_margin : float
|
||||||
all_texts
|
detect_vertical : bool
|
||||||
|
all_texts : bool
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
|
layout : object
|
||||||
|
PDFMiner LTPage object.
|
||||||
|
dim : tuple
|
||||||
|
Dimension of pdf page in the form (width, height).
|
||||||
|
|
||||||
"""
|
"""
|
||||||
with open(pname, 'r') as f:
|
with open(filename, 'r') as f:
|
||||||
parser = PDFParser(f)
|
parser = PDFParser(f)
|
||||||
document = PDFDocument(parser)
|
document = PDFDocument(parser)
|
||||||
if not document.is_extractable:
|
if not document.is_extractable:
|
||||||
|
|
@ -615,12 +671,56 @@ def get_page_layout(pname, char_margin=1.0, line_margin=0.5, word_margin=0.1,
|
||||||
return layout, dim
|
return layout, dim
|
||||||
|
|
||||||
|
|
||||||
def merge_tuples(tuples):
|
def get_text_objects(layout, ltype="char", t=None):
|
||||||
"""
|
"""Recursively parses pdf layout to get a list of
|
||||||
|
PDFMiner text objects.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
tuples
|
layout : object
|
||||||
|
PDFMiner LTPage object.
|
||||||
|
ltype : string
|
||||||
|
Specify 'char', 'lh', 'lv' to get LTChar, LTTextLineHorizontal,
|
||||||
|
and LTTextLineVertical objects respectively.
|
||||||
|
t : list
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
t : list
|
||||||
|
List of PDFMiner text objects.
|
||||||
|
|
||||||
|
"""
|
||||||
|
if ltype == "char":
|
||||||
|
LTObject = LTChar
|
||||||
|
elif ltype == "lh":
|
||||||
|
LTObject = LTTextLineHorizontal
|
||||||
|
elif ltype == "lv":
|
||||||
|
LTObject = LTTextLineVertical
|
||||||
|
if t is None:
|
||||||
|
t = []
|
||||||
|
try:
|
||||||
|
for obj in layout._objs:
|
||||||
|
if isinstance(obj, LTObject):
|
||||||
|
t.append(obj)
|
||||||
|
else:
|
||||||
|
t += get_text_objects(obj, ltype=ltype)
|
||||||
|
except AttributeError:
|
||||||
|
pass
|
||||||
|
return t
|
||||||
|
|
||||||
|
|
||||||
|
def merge_tuples(tuples):
|
||||||
|
"""Merges a list of overlapping tuples.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
tuples : list
|
||||||
|
List of tuples where a tuple is a single axis coordinate pair.
|
||||||
|
|
||||||
|
Yields
|
||||||
|
------
|
||||||
|
tuple
|
||||||
|
|
||||||
"""
|
"""
|
||||||
merged = list(tuples[0])
|
merged = list(tuples[0])
|
||||||
for s, e in tuples:
|
for s, e in tuples:
|
||||||
|
|
|
||||||
38
docs/api.rst
38
docs/api.rst
|
|
@ -4,17 +4,37 @@
|
||||||
API Reference
|
API Reference
|
||||||
=============
|
=============
|
||||||
|
|
||||||
Pdf
|
camelot.read_pdf
|
||||||
===
|
================
|
||||||
.. automodule:: camelot.pdf
|
.. automodule:: camelot.read_pdf
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
Lattice
|
camelot.handlers.PDFHandler
|
||||||
=======
|
===========================
|
||||||
.. automodule:: camelot.lattice
|
.. autoclass:: camelot.handlers.PDFHandler
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
Stream
|
camelot.parsers.Stream
|
||||||
======
|
======================
|
||||||
.. automodule:: camelot.stream
|
.. autoclass:: camelot.parsers.Stream
|
||||||
|
:members:
|
||||||
|
|
||||||
|
camelot.parsers.Lattice
|
||||||
|
=======================
|
||||||
|
.. autoclass:: camelot.parsers.Lattice
|
||||||
|
:members:
|
||||||
|
|
||||||
|
camelot.core.Cell
|
||||||
|
=================
|
||||||
|
.. autoclass:: camelot.core.Cell
|
||||||
|
:members:
|
||||||
|
|
||||||
|
camelot.core.Table
|
||||||
|
==================
|
||||||
|
.. autoclass:: camelot.core.Table
|
||||||
|
:members:
|
||||||
|
|
||||||
|
camelot.core.TableList
|
||||||
|
======================
|
||||||
|
.. autoclass:: camelot.core.TableList
|
||||||
:members:
|
:members:
|
||||||
100
docs/index.rst
100
docs/index.rst
|
|
@ -3,11 +3,11 @@
|
||||||
You can adapt this file completely to your liking, but it should at least
|
You can adapt this file completely to your liking, but it should at least
|
||||||
contain the root `toctree` directive.
|
contain the root `toctree` directive.
|
||||||
|
|
||||||
==================================
|
=====================================
|
||||||
Camelot: pdf parsing made simpler!
|
Camelot: PDF Table Parsing for Humans
|
||||||
==================================
|
=====================================
|
||||||
|
|
||||||
Camelot is a Python 2.7 library and command-line tool for getting tables out of pdf files.
|
Camelot is a Python 2.7 library and command-line tool for extracting tabular data from PDF files.
|
||||||
|
|
||||||
Why another pdf table parsing library?
|
Why another pdf table parsing library?
|
||||||
======================================
|
======================================
|
||||||
|
|
@ -32,12 +32,22 @@ Usage
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
>>> from camelot.pdf import Pdf
|
>>> import camelot
|
||||||
>>> from camelot.lattice import Lattice
|
>>> tables = camelot.read_pdf("foo.pdf")
|
||||||
|
>>> tables
|
||||||
>>> manager = Pdf(Lattice(), 'us-030.pdf')
|
<TableList n=2>
|
||||||
>>> tables = manager.extract()
|
>>> tables.export("foo.csv", f="csv", compress=True) # json, excel, html
|
||||||
>>> print tables['page-1']['table-1']['data']
|
>>> tables[0]
|
||||||
|
<Table shape=(3,4)>
|
||||||
|
>>> tables[0].to_csv("foo.csv") # to_json, to_excel, to_html
|
||||||
|
>>> tables[0].parsing_report
|
||||||
|
{
|
||||||
|
"accuracy": 96,
|
||||||
|
"whitespace": 80,
|
||||||
|
"order": 1,
|
||||||
|
"page": 1
|
||||||
|
}
|
||||||
|
>>> df = tables[0].df
|
||||||
|
|
||||||
.. csv-table::
|
.. csv-table::
|
||||||
:header: "Cycle Name","KI (1/km)","Distance (mi)","Percent Fuel Savings","","",""
|
:header: "Cycle Name","KI (1/km)","Distance (mi)","Percent Fuel Savings","","",""
|
||||||
|
|
@ -49,45 +59,6 @@ Usage
|
||||||
"2032_2","0.17","57.8","21.7%","0.3%","2.7%","1.2%"
|
"2032_2","0.17","57.8","21.7%","0.3%","2.7%","1.2%"
|
||||||
"4171_1","0.07","173.9","58.1%","1.6%","2.1%","0.5%"
|
"4171_1","0.07","173.9","58.1%","1.6%","2.1%","0.5%"
|
||||||
|
|
||||||
Camelot comes with a CLI where you can specify page numbers, output format, output directory etc. By default, the output files are placed in the same directory as the PDF.
|
|
||||||
|
|
||||||
::
|
|
||||||
|
|
||||||
Camelot: PDF parsing made simpler!
|
|
||||||
|
|
||||||
usage:
|
|
||||||
camelot [options] <method> [<args>...]
|
|
||||||
|
|
||||||
options:
|
|
||||||
-h, --help Show this screen.
|
|
||||||
-v, --version Show version.
|
|
||||||
-V, --verbose Verbose.
|
|
||||||
-p, --pages <pageno> Comma-separated list of page numbers.
|
|
||||||
Example: -p 1,3-6,10 [default: 1]
|
|
||||||
-P, --parallel Parallelize the parsing process.
|
|
||||||
-f, --format <format> Output format. (csv,tsv,html,json,xlsx) [default: csv]
|
|
||||||
-l, --log Log to file.
|
|
||||||
-o, --output <directory> Output directory.
|
|
||||||
-M, --cmargin <cmargin> Char margin. Chars closer than cmargin are
|
|
||||||
grouped together to form a word. [default: 1.0]
|
|
||||||
-L, --lmargin <lmargin> Line margin. Lines closer than lmargin are
|
|
||||||
grouped together to form a textbox. [default: 0.5]
|
|
||||||
-W, --wmargin <wmargin> Word margin. Insert blank spaces between chars
|
|
||||||
if distance between words is greater than word
|
|
||||||
margin. [default: 0.1]
|
|
||||||
-J, --split_text Split text lines if they span across multiple cells.
|
|
||||||
-K, --flag_size Flag substring if its size differs from the whole string.
|
|
||||||
Useful for super and subscripts.
|
|
||||||
-X, --print-stats List stats on the parsing process.
|
|
||||||
-Y, --save-stats Save stats to a file.
|
|
||||||
-Z, --plot <dist> Plot distributions. (page,all,rc)
|
|
||||||
|
|
||||||
camelot methods:
|
|
||||||
lattice Looks for lines between data.
|
|
||||||
stream Looks for spaces between data.
|
|
||||||
|
|
||||||
See 'camelot <method> -h' for more information on a specific method.
|
|
||||||
|
|
||||||
Installation
|
Installation
|
||||||
============
|
============
|
||||||
|
|
||||||
|
|
@ -95,42 +66,41 @@ Make sure you have the most updated versions for `pip` and `setuptools`. You can
|
||||||
|
|
||||||
pip install -U pip setuptools
|
pip install -U pip setuptools
|
||||||
|
|
||||||
The required dependencies include `numpy`_, `OpenCV`_ and `ImageMagick`_.
|
The dependencies include `tk`_ and `ghostscript`_.
|
||||||
|
|
||||||
.. _numpy: http://www.numpy.org/
|
.. _tk: https://wiki.tcl.tk/3743
|
||||||
.. _OpenCV: http://opencv.org/
|
.. _ghostscript: https://www.ghostscript.com/
|
||||||
.. _ImageMagick: http://www.imagemagick.org/script/index.php
|
|
||||||
|
|
||||||
Installing dependencies
|
Installing dependencies
|
||||||
-----------------------
|
-----------------------
|
||||||
|
|
||||||
numpy can be install using `pip`. OpenCV and imagemagick can be installed using your system's default package manager.
|
tk and ghostscript can be installed using your system's default package manager.
|
||||||
|
|
||||||
Linux
|
Linux
|
||||||
^^^^^
|
^^^^^
|
||||||
|
|
||||||
* Arch Linux
|
|
||||||
|
|
||||||
::
|
|
||||||
|
|
||||||
sudo pacman -S opencv imagemagick
|
|
||||||
|
|
||||||
* Ubuntu
|
* Ubuntu
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
sudo apt-get install libopencv-dev python-opencv imagemagick
|
sudo apt-get install python-opencv python-tk ghostscript
|
||||||
|
|
||||||
|
* Arch Linux
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
sudo pacman -S opencv tk ghostscript
|
||||||
|
|
||||||
OS X
|
OS X
|
||||||
^^^^
|
^^^^
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
brew install homebrew/science/opencv imagemagick
|
brew install homebrew/science/opencv ghostscript
|
||||||
|
|
||||||
Finally, `cd` into the project directory and install by::
|
Finally, `cd` into the project directory and install by::
|
||||||
|
|
||||||
make install
|
python setup.py install
|
||||||
|
|
||||||
API Reference
|
API Reference
|
||||||
=============
|
=============
|
||||||
|
|
@ -150,14 +120,14 @@ You can check the latest sources with the command::
|
||||||
Contributing
|
Contributing
|
||||||
------------
|
------------
|
||||||
|
|
||||||
See :doc:`Contributing doc <contributing>`.
|
See :doc:`Contributing guidelines <contributing>`.
|
||||||
|
|
||||||
Testing
|
Testing
|
||||||
-------
|
-------
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
make test
|
python setup.py test
|
||||||
|
|
||||||
License
|
License
|
||||||
=======
|
=======
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,11 @@
|
||||||
|
click==6.7
|
||||||
|
matplotlib==2.2.3
|
||||||
|
numpy==1.13.3
|
||||||
|
opencv-python==3.4.2.17
|
||||||
|
pandas==0.23.4
|
||||||
|
pdfminer==20140328
|
||||||
|
Pillow==5.2.0
|
||||||
|
PyPDF2==1.26.0
|
||||||
|
pytest==3.8.0
|
||||||
|
pytest-runner==4.2
|
||||||
|
Sphinx==1.8.0b1
|
||||||
|
|
@ -1,8 +1,8 @@
|
||||||
docopt==0.6.2
|
click==6.7
|
||||||
matplotlib==2.2.3
|
matplotlib==2.2.3
|
||||||
nose==1.3.7
|
numpy==1.13.3
|
||||||
|
opencv-python==3.4.2.17
|
||||||
|
pandas==0.23.4
|
||||||
pdfminer==20140328
|
pdfminer==20140328
|
||||||
pyexcel-xlsx==0.5.6
|
|
||||||
Pillow==5.2.0
|
Pillow==5.2.0
|
||||||
PyPDF2==1.26.0
|
PyPDF2==1.26.0
|
||||||
Sphinx==1.8.0b1
|
|
||||||
16
setup.py
16
setup.py
|
|
@ -4,12 +4,12 @@ import camelot
|
||||||
|
|
||||||
NAME = 'camelot'
|
NAME = 'camelot'
|
||||||
VERSION = camelot.__version__
|
VERSION = camelot.__version__
|
||||||
DESCRIPTION = 'camelot parses tables from PDFs!'
|
DESCRIPTION = 'PDF Table Parsing for Humans'
|
||||||
with open('README.md') as f:
|
with open('README.md') as f:
|
||||||
LONG_DESCRIPTION = f.read()
|
LONG_DESCRIPTION = f.read()
|
||||||
URL = 'https://github.com/socialcopsdev/camelot'
|
URL = 'https://github.com/socialcopsdev/camelot'
|
||||||
AUTHOR = 'Vinayak Mehta'
|
AUTHOR = 'Vinayak Mehta'
|
||||||
AUTHOR_EMAIL = 'vinayak@socialcops.com'
|
AUTHOR_EMAIL = 'vmehta94@gmail.com'
|
||||||
LICENSE = 'BSD License'
|
LICENSE = 'BSD License'
|
||||||
|
|
||||||
opencv_min_version = '2.4.8'
|
opencv_min_version = '2.4.8'
|
||||||
|
|
@ -58,18 +58,14 @@ def setup_package():
|
||||||
|
|
||||||
opencv_status = get_opencv_status()
|
opencv_status = get_opencv_status()
|
||||||
opencv_req_str = "camelot requires OpenCV >= {0}.\n".format(opencv_min_version)
|
opencv_req_str = "camelot requires OpenCV >= {0}.\n".format(opencv_min_version)
|
||||||
instructions = ("Installation instructions are available in the README at "
|
|
||||||
"https://github.com/socialcopsdev/camelot")
|
|
||||||
|
|
||||||
if opencv_status['up_to_date'] is False:
|
if opencv_status['up_to_date'] is False:
|
||||||
if opencv_status['version']:
|
if opencv_status['version']:
|
||||||
raise ImportError("Your installation of OpenCV "
|
raise ImportError("Your installation of OpenCV {} is out-of-date.\n{}"
|
||||||
"{0} is out-of-date.\n{1}{2}"
|
.format(opencv_status['version'], opencv_req_str))
|
||||||
.format(opencv_status['version'],
|
|
||||||
opencv_req_str, instructions))
|
|
||||||
else:
|
else:
|
||||||
raise ImportError("OpenCV is not installed.\n{0}{1}"
|
raise ImportError("OpenCV is not installed.\n{}"
|
||||||
.format(opencv_req_str, instructions))
|
.format(opencv_req_str))
|
||||||
|
|
||||||
setup(**metadata)
|
setup(**metadata)
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue