Add docstrings and update docs

pull/2/head
Vinayak Mehta 2018-09-09 10:00:22 +05:30
parent 16c6b8d45d
commit 9878de4dfc
16 changed files with 997 additions and 421 deletions

2
.gitignore vendored
View File

@ -8,3 +8,5 @@ dist/
.coverage .coverage
.pytest_cache/ .pytest_cache/
_build/
_static/

View File

@ -23,50 +23,9 @@ Camelot is a Python 2.7 library and command-line tool for extracting tabular dat
>>> df = tables[0].df >>> df = tables[0].df
</pre> </pre>
Camelot comes with a CLI where you can specify page numbers, output format, output directory etc. By default, the output files are placed in the same directory as the PDF.
<pre>
Camelot: PDF parsing made simpler!
usage:
camelot [options] &lt;method&gt; [&lt;args&gt;...]
options:
-h, --help Show this screen.
-v, --version Show version.
-V, --verbose Verbose.
-p, --pages &lt;pageno&gt; Comma-separated list of page numbers.
Example: -p 1,3-6,10 [default: 1]
-P, --parallel Parallelize the parsing process.
-f, --format &lt;format&gt; Output format. (csv,tsv,html,json,xlsx) [default: csv]
-l, --log Log to file.
-o, --output &lt;directory&gt; Output directory.
-M, --cmargin &lt;cmargin&gt; Char margin. Chars closer than cmargin are
grouped together to form a word. [default: 2.0]
-L, --lmargin &lt;lmargin&gt; Line margin. Lines closer than lmargin are
grouped together to form a textbox. [default: 0.5]
-W, --wmargin &lt;wmargin&gt; Word margin. Insert blank spaces between chars
if distance between words is greater than word
margin. [default: 0.1]
-J, --split_text Split text lines if they span across multiple cells.
-K, --flag_size Flag substring if its size differs from the whole string.
Useful for super and subscripts.
-X, --print-stats List stats on the parsing process.
-Y, --save-stats Save stats to a file.
-Z, --plot &lt;dist&gt; Plot distributions. (page,all,rc)
camelot methods:
lattice Looks for lines between data.
stream Looks for spaces between data.
See 'camelot &lt;method&gt; -h' for more information on a specific method.
</pre>
## Dependencies ## Dependencies
Currently, camelot works under Python 2.7. The dependencies include [tk](https://wiki.tcl.tk/3743) and [ghostscript](https://www.ghostscript.com/).
The required dependencies include [numpy](http://www.numpy.org/), [OpenCV](http://opencv.org/) and [ghostscript](https://www.ghostscript.com/).
## Installation ## Installation
@ -78,22 +37,22 @@ pip install -U pip setuptools
### Installing dependencies ### Installing dependencies
numpy can be installed using `pip`. OpenCV and ghostscript can be installed using your system's default package manager. tk and ghostscript can be installed using your system's default package manager.
#### Linux #### Linux
* Arch Linux
<pre>
sudo pacman -S opencv tk ghostscript
</pre>
* Ubuntu * Ubuntu
<pre> <pre>
sudo apt-get install python-opencv python-tk ghostscript sudo apt-get install python-opencv python-tk ghostscript
</pre> </pre>
* Arch Linux
<pre>
sudo pacman -S opencv tk ghostscript
</pre>
#### OS X #### OS X
<pre> <pre>
@ -103,7 +62,7 @@ brew install homebrew/science/opencv ghostscript
Finally, `cd` into the project directory and install by Finally, `cd` into the project directory and install by
<pre> <pre>
make install python setup.py install
</pre> </pre>
## Development ## Development
@ -118,14 +77,14 @@ git clone https://github.com/socialcopsdev/camelot.git
### Contributing ### Contributing
See [Contributing doc](). See [Contributing guidelines]().
### Testing ### Testing
<pre> <pre>
make test python setup.py test
</pre> </pre>
## License ## License
BSD License BSD License

View File

@ -8,9 +8,48 @@ import pandas as pd
class Cell(object): class Cell(object):
""" """Defines a cell in a table with coordinates relative to a
left-bottom origin. (pdf coordinate space)
Parameters
----------
x1 : float
x-coordinate of left-bottom point.
y1 : float
y-coordinate of left-bottom point.
x2 : float
x-coordinate of right-top point.
y2 : float
y-coordinate of right-top point.
Attributes
----------
lb : tuple
Tuple representing left-bottom coordinates.
lt : tuple
Tuple representing left-top coordinates.
rb : tuple
Tuple representing right-bottom coordinates.
rt : tuple
Tuple representing right-top coordinates.
left : bool
Whether or not cell is bounded on the left.
right : bool
Whether or not cell is bounded on the right.
top : bool
Whether or not cell is bounded on the top.
bottom : bool
Whether or not cell is bounded on the bottom.
hspan : bool
Whether or not cell spans horizontally.
vspan : bool
Whether or not cell spans vertically.
text : string
Text assigned to cell.
bound
""" """
def __init__(self, x1, y1, x2, y2): def __init__(self, x1, y1, x2, y2):
self.x1 = x1 self.x1 = x1
self.y1 = y1 self.y1 = y1
@ -34,37 +73,48 @@ class Cell(object):
@property @property
def text(self): def text(self):
"""
Returns
-------
"""
return self._text return self._text
@text.setter @text.setter
def text(self, t): def text(self, t):
"""
Parameters
----------
t
"""
self._text = ''.join([self._text, t]) self._text = ''.join([self._text, t])
@property @property
def bound(self): def bound(self):
""" """The number of sides on which the cell is bounded.
Returns
-------
""" """
return self.top + self.bottom + self.left + self.right return self.top + self.bottom + self.left + self.right
class Table(object): class Table(object):
""" """Defines a table with coordinates relative to a left-bottom
origin. (pdf coordinate space)
Parameters
----------
cols : list
List of tuples representing column x-coordinates in increasing
order.
rows : list
List of tuples representing row y-coordinates in decreasing
order.
Attributes
----------
df : object
pandas.DataFrame
shape : tuple
Shape of the table.
accuracy : float
Accuracy with which text was assigned to the cell.
whitespace : float
Percentage of whitespace in the table.
order : int
Table number on pdf page.
page : int
Pdf page number.
data
parsing_report
""" """
def __init__(self, cols, rows): def __init__(self, cols, rows):
@ -84,11 +134,7 @@ class Table(object):
@property @property
def data(self): def data(self):
""" """Returns two-dimensional list of strings in table.
Returns
-------
""" """
d = [] d = []
for row in self.cells: for row in self.cells:
@ -97,11 +143,8 @@ class Table(object):
@property @property
def parsing_report(self): def parsing_report(self):
""" """Returns a parsing report with accuracy, %whitespace,
table number on page and page number.
Returns
-------
""" """
# pretty? # pretty?
report = { report = {
@ -112,27 +155,8 @@ class Table(object):
} }
return report return report
def set_border(self):
"""
Returns
-------
"""
for r in range(len(self.rows)):
self.cells[r][0].left = True
self.cells[r][len(self.cols) - 1].right = True
for c in range(len(self.cols)):
self.cells[0][c].top = True
self.cells[len(self.rows) - 1][c].bottom = True
return self
def set_all_edges(self): def set_all_edges(self):
""" """Sets all table edges to True.
Returns
-------
""" """
for row in self.cells: for row in self.cells:
for cell in row: for cell in row:
@ -140,16 +164,16 @@ class Table(object):
return self return self
def set_edges(self, vertical, horizontal, joint_close_tol=2): def set_edges(self, vertical, horizontal, joint_close_tol=2):
""" """Sets a cell's edges to True depending on whether the cell's
coordinates overlap with the line's coordinates within a
tolerance.
Parameters Parameters
---------- ----------
vertical vertical : list
horizontal List of detected vertical lines.
joint_close_tol horizontal : list
List of detected horizontal lines.
Returns
-------
""" """
for v in vertical: for v in vertical:
@ -256,12 +280,20 @@ class Table(object):
return self return self
def set_span(self): def set_border(self):
"""Sets table border edges to True.
""" """
for r in range(len(self.rows)):
self.cells[r][0].left = True
self.cells[r][len(self.cols) - 1].right = True
for c in range(len(self.cols)):
self.cells[0][c].top = True
self.cells[len(self.rows) - 1][c].bottom = True
return self
Returns def set_span(self):
------- """Sets a cell's hspan or vspan attribute to True depending
on whether the cell spans horizontally or vertically.
""" """
for row in self.cells: for row in self.cells:
for cell in row: for cell in row:
@ -288,6 +320,8 @@ class Table(object):
return self return self
def to_csv(self, path, **kwargs): def to_csv(self, path, **kwargs):
"""Write Table to a comma-separated values (csv) file.
"""
kw = { kw = {
'encoding': 'utf-8', 'encoding': 'utf-8',
'index': False, 'index': False,
@ -297,6 +331,8 @@ class Table(object):
self.df.to_csv(path, **kw) self.df.to_csv(path, **kw)
def to_json(self, path, **kwargs): def to_json(self, path, **kwargs):
"""Write Table to a JSON file.
"""
kw = { kw = {
'orient': 'records' 'orient': 'records'
} }
@ -306,6 +342,8 @@ class Table(object):
f.write(json_string) f.write(json_string)
def to_excel(self, path, **kwargs): def to_excel(self, path, **kwargs):
"""Write Table to an Excel file.
"""
kw = { kw = {
'sheet_name': 'page-{}-table-{}'.format(self.page, self.order), 'sheet_name': 'page-{}-table-{}'.format(self.page, self.order),
'encoding': 'utf-8' 'encoding': 'utf-8'
@ -316,13 +354,21 @@ class Table(object):
writer.save() writer.save()
def to_html(self, path, **kwargs): def to_html(self, path, **kwargs):
"""Write Table to an HTML file.
"""
html_string = self.df.to_html(**kwargs) html_string = self.df.to_html(**kwargs)
with open(path, 'w') as f: with open(path, 'w') as f:
f.write(html_string) f.write(html_string)
class TableList(object): class TableList(object):
""" """Defines a list of camelot.core.Table objects. Each table can
be accessed using its index.
Attributes
----------
n : int
Number of tables in the list.
""" """
def __init__(self, tables): def __init__(self, tables):
@ -371,6 +417,18 @@ class TableList(object):
z.write(filepath, os.path.basename(filepath)) z.write(filepath, os.path.basename(filepath))
def export(self, path, f='csv', compress=False): def export(self, path, f='csv', compress=False):
"""Exports the list of tables to specified file format.
Parameters
----------
path : str
Filepath
f : str
File format. Can be csv, json, excel or html.
compress : bool
Whether or not to add files to a ZIP archive.
"""
dirname = os.path.dirname(path) dirname = os.path.dirname(path)
basename = os.path.basename(path) basename = os.path.basename(path)
root, ext = os.path.splitext(basename) root, ext = os.path.splitext(basename)
@ -402,9 +460,6 @@ class TableList(object):
class Geometry(object): class Geometry(object):
"""
"""
def __init__(self): def __init__(self):
self.text = [] self.text = []
self.images = () self.images = ()
@ -421,9 +476,6 @@ class Geometry(object):
class GeometryList(object): class GeometryList(object):
"""
"""
def __init__(self, geometry): def __init__(self, geometry):
self.text = [g.text for g in geometry] self.text = [g.text for g in geometry]
self.images = [g.images for g in geometry] self.images = [g.images for g in geometry]

View File

@ -9,18 +9,43 @@ from .utils import get_page_layout, get_text_objects, get_rotation
class PDFHandler(object): class PDFHandler(object):
""" """Handles all operations like temp directory creation, splitting
file into single page pdfs, parsing each pdf and then removing the
temp directory.
Parameters
----------
filename : str
Path to pdf file.
pages : str
Comma-separated page numbers to parse.
Example: 1,3,4 or 1,4-end
""" """
def __init__(self, filename, pages='1'): def __init__(self, filename, pages='1'):
self.filename = filename self.filename = filename
if not self.filename.endswith('.pdf'): if not self.filename.endswith('.pdf'):
raise TypeError("File format not supported.") raise TypeError("File format not supported.")
self.pages = self.__get_pages(self.filename, pages) self.pages = self._get_pages(self.filename, pages)
self.tempdir = tempfile.mkdtemp() self.tempdir = tempfile.mkdtemp()
def __get_pages(self, filename, pages): def _get_pages(self, filename, pages):
# refactor """Converts pages string to list of ints.
Parameters
----------
filename : str
Path to pdf file.
pages : str
Comma-separated page numbers to parse.
Example: 1,3,4 or 1,4-end
Returns
-------
P : list
List of int page numbers.
"""
page_numbers = [] page_numbers = []
if pages == '1': if pages == '1':
page_numbers.append({'start': 1, 'end': 1}) page_numbers.append({'start': 1, 'end': 1})
@ -42,8 +67,19 @@ class PDFHandler(object):
P.extend(range(p['start'], p['end'] + 1)) P.extend(range(p['start'], p['end'] + 1))
return sorted(set(P)) return sorted(set(P))
def __save_page(self, filename, page, temp): def _save_page(self, filename, page, temp):
# refactor """Saves specified page from pdf into a temporary directory.
Parameters
----------
filename : str
Path to pdf file.
page : int
Page number
temp : str
Tmp directory
"""
with open(filename, 'rb') as fileobj: with open(filename, 'rb') as fileobj:
infile = PdfFileReader(fileobj, strict=False) infile = PdfFileReader(fileobj, strict=False)
fpath = os.path.join(temp, 'page-{0}.pdf'.format(page)) fpath = os.path.join(temp, 'page-{0}.pdf'.format(page))
@ -65,28 +101,37 @@ class PDFHandler(object):
infile = PdfFileReader(open(fpath_new, 'rb'), strict=False) infile = PdfFileReader(open(fpath_new, 'rb'), strict=False)
outfile = PdfFileWriter() outfile = PdfFileWriter()
p = infile.getPage(0) p = infile.getPage(0)
if rotation == 'left': if rotation == 'anticlockwise':
p.rotateClockwise(90) p.rotateClockwise(90)
elif rotation == 'right': elif rotation == 'clockwise':
p.rotateCounterClockwise(90) p.rotateCounterClockwise(90)
outfile.addPage(p) outfile.addPage(p)
with open(fpath, 'wb') as f: with open(fpath, 'wb') as f:
outfile.write(f) outfile.write(f)
def parse(self, mesh=False, **kwargs): def parse(self, mesh=False, **kwargs):
""" """Extracts tables by calling parser.get_tables on all single
page pdfs.
Parameters Parameters
---------- ----------
mesh mesh : bool (default: False)
kwargs Whether or not to use Lattice method of parsing. Stream
is used by default.
kwargs : dict
See camelot.read_pdf kwargs.
Returns Returns
------- -------
tables : camelot.core.TableList
List of tables found in pdf.
geometry : camelot.core.GeometryList
List of geometry objects (contours, lines, joints)
found in pdf.
""" """
for p in self.pages: for p in self.pages:
self.__save_page(self.filename, p, self.tempdir) self._save_page(self.filename, p, self.tempdir)
pages = [os.path.join(self.tempdir, 'page-{0}.pdf'.format(p)) pages = [os.path.join(self.tempdir, 'page-{0}.pdf'.format(p))
for p in self.pages] for p in self.pages]
tables = [] tables = []

View File

@ -9,17 +9,31 @@ from .utils import merge_tuples
def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2): def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
""" """Thresholds an image using OpenCV's adaptiveThreshold.
Parameters Parameters
---------- ----------
imagename imagename : string
process_background Path to image file.
blocksize process_background : bool, optional (default: False)
c Whether or not to process lines that are in background.
blocksize : int, optional (default: 15)
Size of a pixel neighborhood that is used to calculate a
threshold value for the pixel: 3, 5, 7, and so on.
For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
c : int, optional (default: -2)
Constant subtracted from the mean or weighted mean.
Normally, it is positive but may be zero or negative as well.
For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
Returns Returns
------- -------
img : object
numpy.ndarray representing the original image.
threshold : object
numpy.ndarray representing the thresholded image.
""" """
img = cv2.imread(imagename) img = cv2.imread(imagename)
@ -35,17 +49,35 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
def find_lines(threshold, direction='horizontal', line_size_scaling=15, iterations=0): def find_lines(threshold, direction='horizontal', line_size_scaling=15, iterations=0):
""" """Finds horizontal and vertical lines by applying morphological
transformations on an image.
Parameters Parameters
---------- ----------
threshold threshold : object
direction numpy.ndarray representing the thresholded image.
line_size_scaling direction : string, optional (default: 'horizontal')
iterations Specifies whether to find vertical or horizontal lines.
line_size_scaling : int, optional (default: 15)
Factor by which the page dimensions will be divided to get
smallest length of lines that should be detected.
The larger this value, smaller the detected lines. Making it
too large will lead to text being detected as lines.
iterations : int, optional (default: 0)
Number of times erosion/dilation is applied.
For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
Returns Returns
------- -------
dmask : object
numpy.ndarray representing pixels where vertical/horizontal
lines lie.
lines : list
List of tuples representing vertical/horizontal lines with
coordinates relative to a left-top origin in
image coordinate space.
""" """
lines = [] lines = []
@ -84,15 +116,21 @@ def find_lines(threshold, direction='horizontal', line_size_scaling=15, iteratio
def find_table_contours(vertical, horizontal): def find_table_contours(vertical, horizontal):
""" """Finds table boundaries using OpenCV's findContours.
Parameters Parameters
---------- ----------
vertical vertical : object
horizontal numpy.ndarray representing pixels where vertical lines lie.
horizontal : object
numpy.ndarray representing pixels where horizontal lines lie.
Returns Returns
------- -------
cont : list
List of tuples representing table boundaries. Each tuple is of
the form (x, y, w, h) where (x, y) -> left-top, w -> width and
h -> height in image coordinate space.
""" """
mask = vertical + horizontal mask = vertical + horizontal
@ -114,16 +152,26 @@ def find_table_contours(vertical, horizontal):
def find_table_joints(contours, vertical, horizontal): def find_table_joints(contours, vertical, horizontal):
""" """Finds joints/intersections present inside each table boundary.
Parameters Parameters
---------- ----------
contours contours : list
vertical List of tuples representing table boundaries. Each tuple is of
horizontal the form (x, y, w, h) where (x, y) -> left-top, w -> width and
h -> height in image coordinate space.
vertical : object
numpy.ndarray representing pixels where vertical lines lie.
horizontal : object
numpy.ndarray representing pixels where horizontal lines lie.
Returns Returns
------- -------
tables : dict
Dict with table boundaries as keys and list of intersections
in that boundary as their value.
Keys are of the form (x1, y1, x2, y2) where (x1, y1) -> lb
and (x2, y2) -> rt in image coordinate space.
""" """
joints = np.bitwise_and(vertical, horizontal) joints = np.bitwise_and(vertical, horizontal)
@ -150,15 +198,24 @@ def find_table_joints(contours, vertical, horizontal):
def remove_lines(threshold, line_size_scaling=15): def remove_lines(threshold, line_size_scaling=15):
""" """Removes lines from a thresholded image.
Parameters Parameters
---------- ----------
threshold threshold : object
line_size_scaling numpy.ndarray representing the thresholded image.
line_size_scaling : int, optional (default: 15)
Factor by which the page dimensions will be divided to get
smallest length of lines that should be detected.
The larger this value, smaller the detected lines. Making it
too large will lead to text being detected as lines.
Returns Returns
------- -------
threshold : object
numpy.ndarray representing the thresholded image
with horizontal and vertical lines removed.
""" """
size = threshold.shape[0] // line_size_scaling size = threshold.shape[0] // line_size_scaling
@ -178,16 +235,23 @@ def remove_lines(threshold, line_size_scaling=15):
def find_cuts(threshold, char_size_scaling=200): def find_cuts(threshold, char_size_scaling=200):
""" """Finds cuts made by text projections on y-axis.
Parameters Parameters
---------- ----------
threshold threshold : object
char_size_scaling numpy.ndarray representing the thresholded image.
char_size_scaling : int, optional (default: 200)
Factor by which the page dimensions will be divided to get
smallest length of lines that should be detected.
The larger this value, smaller the detected lines. Making it
too large will lead to text being detected as lines.
Returns Returns
------- -------
y_cuts : list
List of cuts on y-axis.
""" """
size = threshold.shape[0] // char_size_scaling size = threshold.shape[0] // char_size_scaling
char_el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size)) char_el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size))

View File

@ -2,20 +2,93 @@ from .handlers import PDFHandler
def read_pdf(filepath, pages='1', mesh=False, **kwargs): def read_pdf(filepath, pages='1', mesh=False, **kwargs):
""" """Read PDF and return parsed data tables.
Note: kwargs annotated with ^ can only be used with mesh=False
and kwargs annotated with * can only be used with mesh=True.
Parameters Parameters
---------- ----------
filepath filepath : str
pages Path to pdf file.
mesh pages : str
kwargs Comma-separated page numbers to parse.
Example: 1,3,4 or 1,4-end
mesh : bool (default: False)
Whether or not to use Lattice method of parsing. Stream
is used by default.
table_area : list, optional (default: None)
List of table areas to analyze as strings of the form
x1,y1,x2,y2 where (x1, y1) -> left-top and
(x2, y2) -> right-bottom in pdf coordinate space.
columns^ : list, optional (default: None)
List of column x-coordinates as strings where the coordinates
are comma-separated.
split_text : bool, optional (default: False)
Whether or not to split a text line if it spans across
multiple cells.
flag_size : bool, optional (default: False)
Whether or not to highlight a substring using <s></s>
if its size is different from rest of the string, useful for
super and subscripts.
row_close_tol^ : int, optional (default: 2)
Rows will be formed by combining text vertically
within this tolerance.
col_close_tol^ : int, optional (default: 0)
Columns will be formed by combining text horizontally
within this tolerance.
process_background* : bool, optional (default: False)
Whether or not to process lines that are in background.
line_size_scaling* : int, optional (default: 15)
Factor by which the page dimensions will be divided to get
smallest length of lines that should be detected.
The larger this value, smaller the detected lines. Making it
too large will lead to text being detected as lines.
copy_text* : list, optional (default: None)
{'h', 'v'}
Select one or more strings from above and pass them as a list
to specify the direction in which text should be copied over
when a cell spans multiple rows or columns.
shift_text* : list, optional (default: ['l', 't'])
{'l', 'r', 't', 'b'}
Select one or more strings from above and pass them as a list
to specify where the text in a spanning cell should flow.
line_close_tol* : int, optional (default: 2)
Tolerance parameter used to merge vertical and horizontal
detected lines which lie close to each other.
joint_close_tol* : int, optional (default: 2)
Tolerance parameter used to decide whether the detected lines
and points lie close to each other.
threshold_blocksize : int, optional (default: 15)
Size of a pixel neighborhood that is used to calculate a
threshold value for the pixel: 3, 5, 7, and so on.
For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
threshold_constant : int, optional (default: -2)
Constant subtracted from the mean or weighted mean.
Normally, it is positive but may be zero or negative as well.
For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
iterations : int, optional (default: 0)
Number of times erosion/dilation is applied.
For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
margins : tuple
PDFMiner margins. (char_margin, line_margin, word_margin)
For more information, refer to `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
debug : bool, optional (default: False)
Whether or not to return all text objects on the page
which can be used to generate a matplotlib plot, to get
values for table_area(s) and debugging.
Returns Returns
------- -------
tables : camelot.core.TableList
""" """
# explicit type conversion # validate kwargs?
p = PDFHandler(filepath, pages) p = PDFHandler(filepath, pages)
tables, __ = p.parse(mesh=mesh, **kwargs) tables, __ = p.parse(mesh=mesh, **kwargs)
return tables return tables

View File

@ -5,8 +5,7 @@ from ..utils import get_page_layout, get_text_objects
class BaseParser(object): class BaseParser(object):
""" """Defines a base parser.
""" """
def _generate_layout(self, filename): def _generate_layout(self, filename):
self.filename = filename self.filename = filename

View File

@ -11,7 +11,7 @@ from .base import BaseParser
from ..core import Table from ..core import Table
from ..utils import (scale_image, scale_pdf, segments_in_bbox, text_in_bbox, from ..utils import (scale_image, scale_pdf, segments_in_bbox, text_in_bbox,
merge_close_lines, get_table_index, compute_accuracy, merge_close_lines, get_table_index, compute_accuracy,
count_empty_strings, encode_, setup_logging) compute_whitespace, setup_logging, encode_)
from ..image_processing import (adaptive_threshold, find_lines, from ..image_processing import (adaptive_threshold, find_lines,
find_table_contours, find_table_joints) find_table_contours, find_table_joints)
@ -20,14 +20,74 @@ logger = setup_logging(__name__)
class Lattice(BaseParser): class Lattice(BaseParser):
""" """Lattice method of parsing looks for lines between text
to form a table.
Parameters
----------
table_area : list, optional (default: None)
List of table areas to analyze as strings of the form
x1,y1,x2,y2 where (x1, y1) -> left-top and
(x2, y2) -> right-bottom in pdf coordinate space.
process_background : bool, optional (default: False)
Whether or not to process lines that are in background.
line_size_scaling : int, optional (default: 15)
Factor by which the page dimensions will be divided to get
smallest length of lines that should be detected.
The larger this value, smaller the detected lines. Making it
too large will lead to text being detected as lines.
copy_text : list, optional (default: None)
{'h', 'v'}
Select one or more strings from above and pass them as a list
to specify the direction in which text should be copied over
when a cell spans multiple rows or columns.
shift_text : list, optional (default: ['l', 't'])
{'l', 'r', 't', 'b'}
Select one or more strings from above and pass them as a list
to specify where the text in a spanning cell should flow.
split_text : bool, optional (default: False)
Whether or not to split a text line if it spans across
multiple cells.
flag_size : bool, optional (default: False)
Whether or not to highlight a substring using <s></s>
if its size is different from rest of the string, useful for
super and subscripts.
line_close_tol : int, optional (default: 2)
Tolerance parameter used to merge vertical and horizontal
detected lines which lie close to each other.
joint_close_tol : int, optional (default: 2)
Tolerance parameter used to decide whether the detected lines
and points lie close to each other.
threshold_blocksize : int, optional (default: 15)
Size of a pixel neighborhood that is used to calculate a
threshold value for the pixel: 3, 5, 7, and so on.
For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
threshold_constant : int, optional (default: -2)
Constant subtracted from the mean or weighted mean.
Normally, it is positive but may be zero or negative as well.
For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
iterations : int, optional (default: 0)
Number of times erosion/dilation is applied.
For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
margins : tuple
PDFMiner margins. (char_margin, line_margin, word_margin)
For more information, refer to `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
debug : bool, optional (default: False)
Whether or not to return all text objects on the page
which can be used to generate a matplotlib plot, to get
values for table_area(s) and debugging.
""" """
def __init__(self, table_area=None, process_background=False, def __init__(self, table_area=None, process_background=False,
line_size_scaling=15, copy_text=None, shift_text=['l', 't'], line_size_scaling=15, copy_text=None, shift_text=['l', 't'],
split_text=False, flag_size=False, line_close_tol=2, split_text=False, flag_size=False, line_close_tol=2,
joint_close_tol=2, threshold_blocksize=15, threshold_constant=-2, joint_close_tol=2, threshold_blocksize=15, threshold_constant=-2,
iterations=0, margins=(1.0, 0.5, 0.1), debug=None): iterations=0, margins=(1.0, 0.5, 0.1), debug=False):
self.table_area = table_area self.table_area = table_area
self.process_background = process_background self.process_background = process_background
self.line_size_scaling = line_size_scaling self.line_size_scaling = line_size_scaling
@ -45,6 +105,27 @@ class Lattice(BaseParser):
@staticmethod @staticmethod
def _reduce_index(t, idx, shift_text): def _reduce_index(t, idx, shift_text):
"""Reduces index of a text object if it lies within a spanning
cell.
Parameters
----------
t : camelot.core.Table
idx : list
List of tuples of the form (r_idx, c_idx, text).
shift_text : list
{'l', 'r', 't', 'b'}
Select one or more strings from above and pass them as a
list to specify where the text in a spanning cell should
flow.
Returns
-------
indices : list
List of tuples of the form (r_idx, c_idx, text) where
r_idx and c_idx are new row and column indices for text.
"""
indices = [] indices = []
for r_idx, c_idx, text in idx: for r_idx, c_idx, text in idx:
for d in shift_text: for d in shift_text:
@ -69,6 +150,22 @@ class Lattice(BaseParser):
@staticmethod @staticmethod
def _copy_spanning_text(t, copy_text=None): def _copy_spanning_text(t, copy_text=None):
"""Copies over text in empty spanning cells.
Parameters
----------
t : camelot.core.Table
copy_text : list, optional (default: None)
{'h', 'v'}
Select one or more strings from above and pass them as a list
to specify the direction in which text should be copied over
when a cell spans multiple rows or columns.
Returns
-------
t : camelot.core.Table
"""
for f in copy_text: for f in copy_text:
if f == "h": if f == "h":
for i in range(len(t.cells)): for i in range(len(t.cells)):
@ -199,7 +296,7 @@ class Lattice(BaseParser):
table.df = pd.DataFrame(data) table.df = pd.DataFrame(data)
table.shape = table.df.shape table.shape = table.df.shape
whitespace, __, __ = count_empty_strings(data) whitespace = compute_whitespace(data)
table.accuracy = accuracy table.accuracy = accuracy
table.whitespace = whitespace table.whitespace = whitespace
table.order = table_idx + 1 table.order = table_idx + 1
@ -208,16 +305,6 @@ class Lattice(BaseParser):
return table return table
def extract_tables(self, filename): def extract_tables(self, filename):
"""
Parameters
----------
filename
Returns
-------
"""
logger.info('Processing {}'.format(os.path.basename(filename))) logger.info('Processing {}'.format(os.path.basename(filename)))
self._generate_layout(filename) self._generate_layout(filename)
@ -237,7 +324,7 @@ class Lattice(BaseParser):
table = self._generate_table(table_idx, cols, rows, v_s=v_s, h_s=h_s) table = self._generate_table(table_idx, cols, rows, v_s=v_s, h_s=h_s)
_tables.append(table) _tables.append(table)
if self.debug is not None: if self.debug:
text = [] text = []
text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text]) text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text]) text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])

View File

@ -8,19 +8,54 @@ import pandas as pd
from .base import BaseParser from .base import BaseParser
from ..core import Table from ..core import Table
from ..utils import (text_in_bbox, get_table_index, compute_accuracy, from ..utils import (text_in_bbox, get_table_index, compute_accuracy,
count_empty_strings, encode_, setup_logging) compute_whitespace, setup_logging, encode_)
logger = setup_logging(__name__) logger = setup_logging(__name__)
class Stream(BaseParser): class Stream(BaseParser):
""" """Stream method of parsing looks for spaces between text
to form a table.
If you want to specify columns when specifying multiple table
areas, make sure that both lists have the same length.
Parameters
----------
table_area : list, optional (default: None)
List of table areas to analyze as strings of the form
x1,y1,x2,y2 where (x1, y1) -> left-top and
(x2, y2) -> right-bottom in pdf coordinate space.
columns : list, optional (default: None)
List of column x-coordinates as strings where the coordinates
are comma-separated.
split_text : bool, optional (default: False)
Whether or not to split a text line if it spans across
multiple cells.
flag_size : bool, optional (default: False)
Whether or not to highlight a substring using <s></s>
if its size is different from rest of the string, useful for
super and subscripts.
row_close_tol : int, optional (default: 2)
Rows will be formed by combining text vertically
within this tolerance.
col_close_tol : int, optional (default: 0)
Columns will be formed by combining text horizontally
within this tolerance.
margins : tuple, optional (default: (1.0, 0.5, 0.1))
PDFMiner margins. (char_margin, line_margin, word_margin)
For more information, refer to `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
debug : bool, optional (default: False)
Whether or not to return all text objects on the page
which can be used to generate a matplotlib plot, to get
values for table_area(s), columns and debugging.
""" """
def __init__(self, table_area=None, columns=None, split_text=False, def __init__(self, table_area=None, columns=None, split_text=False,
flag_size=False, row_close_tol=2, col_close_tol=0, flag_size=False, row_close_tol=2, col_close_tol=0,
margins=(1.0, 0.5, 0.1), debug=None): margins=(1.0, 0.5, 0.1), debug=False):
self.table_area = table_area self.table_area = table_area
self.columns = columns self.columns = columns
self._validate_columns() self._validate_columns()
@ -33,6 +68,20 @@ class Stream(BaseParser):
@staticmethod @staticmethod
def _text_bbox(t_bbox): def _text_bbox(t_bbox):
"""Returns bounding box for the text present on a page.
Parameters
----------
t_bbox : dict
Dict with two keys 'horizontal' and 'vertical' with lists of
LTTextLineHorizontals and LTTextLineVerticals respectively.
Returns
-------
text_bbox : tuple
Tuple (x0, y0, x1, y1) in pdf coordinate space.
"""
xmin = min([t.x0 for direction in t_bbox for t in t_bbox[direction]]) xmin = min([t.x0 for direction in t_bbox for t in t_bbox[direction]])
ymin = min([t.y0 for direction in t_bbox for t in t_bbox[direction]]) ymin = min([t.y0 for direction in t_bbox for t in t_bbox[direction]])
xmax = max([t.x1 for direction in t_bbox for t in t_bbox[direction]]) xmax = max([t.x1 for direction in t_bbox for t in t_bbox[direction]])
@ -42,6 +91,21 @@ class Stream(BaseParser):
@staticmethod @staticmethod
def _group_rows(text, row_close_tol=2): def _group_rows(text, row_close_tol=2):
"""Groups PDFMiner text objects into rows vertically
within a tolerance.
Parameters
----------
text : list
List of PDFMiner text objects.
row_close_tol : int, optional (default: 2)
Returns
-------
rows : list
Two-dimensional list of text objects grouped into rows.
"""
row_y = 0 row_y = 0
rows = [] rows = []
temp = [] temp = []
@ -61,6 +125,21 @@ class Stream(BaseParser):
@staticmethod @staticmethod
def _merge_columns(l, col_close_tol=0): def _merge_columns(l, col_close_tol=0):
"""Merges column boundaries horizontally if they overlap
or lie within a tolerance.
Parameters
----------
l : list
List of column x-coordinate tuples.
col_close_tol : int, optional (default: 0)
Returns
-------
merged : list
List of merged column x-coordinate tuples.
"""
merged = [] merged = []
for higher in l: for higher in l:
if not merged: if not merged:
@ -89,6 +168,21 @@ class Stream(BaseParser):
@staticmethod @staticmethod
def _join_rows(rows_grouped, text_y_max, text_y_min): def _join_rows(rows_grouped, text_y_max, text_y_min):
"""Makes row coordinates continuous.
Parameters
----------
rows_grouped : list
Two-dimensional list of text objects grouped into rows.
text_y_max : int
text_y_min : int
Returns
-------
rows : list
List of continuous row y-coordinate tuples.
"""
row_mids = [sum([(t.y0 + t.y1) / 2 for t in r]) / len(r) row_mids = [sum([(t.y0 + t.y1) / 2 for t in r]) / len(r)
if len(r) > 0 else 0 for r in rows_grouped] if len(r) > 0 else 0 for r in rows_grouped]
rows = [(row_mids[i] + row_mids[i - 1]) / 2 for i in range(1, len(row_mids))] rows = [(row_mids[i] + row_mids[i - 1]) / 2 for i in range(1, len(row_mids))]
@ -100,6 +194,23 @@ class Stream(BaseParser):
@staticmethod @staticmethod
def _add_columns(cols, text, row_close_tol): def _add_columns(cols, text, row_close_tol):
"""Adds columns to existing list by taking into account
the text that lies outside the current column x-coordinates.
Parameters
----------
cols : list
List of column x-coordinate tuples.
text : list
List of PDFMiner text objects.
row_close_tol : int
Returns
-------
cols : list
Updated list of column x-coordinate tuples.
"""
if text: if text:
text = Stream._group_rows(text, row_close_tol=row_close_tol) text = Stream._group_rows(text, row_close_tol=row_close_tol)
elements = [len(r) for r in text] elements = [len(r) for r in text]
@ -110,6 +221,21 @@ class Stream(BaseParser):
@staticmethod @staticmethod
def _join_columns(cols, text_x_min, text_x_max): def _join_columns(cols, text_x_min, text_x_max):
"""Makes column coordinates continuous.
Parameters
----------
cols : list
List of column x-coordinate tuples.
text_x_min : int
text_x_max : int
Returns
-------
cols : list
Updated list of column x-coordinate tuples.
"""
cols = sorted(cols) cols = sorted(cols)
cols = [(cols[i][0] + cols[i - 1][1]) / 2 for i in range(1, len(cols))] cols = [(cols[i][0] + cols[i - 1][1]) / 2 for i in range(1, len(cols))]
cols.insert(0, text_x_min) cols.insert(0, text_x_min)
@ -207,7 +333,7 @@ class Stream(BaseParser):
table.df = pd.DataFrame(data) table.df = pd.DataFrame(data)
table.shape = table.df.shape table.shape = table.df.shape
whitespace, __, __ = count_empty_strings(data) whitespace = compute_whitespace(data)
table.accuracy = accuracy table.accuracy = accuracy
table.whitespace = whitespace table.whitespace = whitespace
table.order = table_idx + 1 table.order = table_idx + 1
@ -216,16 +342,6 @@ class Stream(BaseParser):
return table return table
def extract_tables(self, filename): def extract_tables(self, filename):
"""
Parameters
----------
filename
Returns
-------
"""
logger.info('Processing {}'.format(os.path.basename(filename))) logger.info('Processing {}'.format(os.path.basename(filename)))
self._generate_layout(filename) self._generate_layout(filename)
@ -244,7 +360,7 @@ class Stream(BaseParser):
table = self._generate_table(table_idx, cols, rows) table = self._generate_table(table_idx, cols, rows)
_tables.append(table) _tables.append(table)
if self.debug is not None: if self.debug:
text = [] text = []
text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text]) text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text]) text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])

View File

@ -6,19 +6,101 @@ from .handlers import PDFHandler
def plot_geometry(filepath, pages='1', mesh=False, geometry_type='text', **kwargs): def plot_geometry(filepath, pages='1', mesh=False, geometry_type='text', **kwargs):
""" """Plot geometry found on pdf page based on type specified,
useful for debugging and playing with different parameters to get
the best output.
Note: kwargs annotated with ^ can only be used with mesh=False
and kwargs annotated with * can only be used with mesh=True.
Parameters Parameters
---------- ----------
filepath filepath : str
pages Path to pdf file.
mesh pages : str
geometry_type Comma-separated page numbers to parse.
kwargs Example: 1,3,4 or 1,4-end
mesh : bool (default: False)
Whether or not to use Lattice method of parsing. Stream
is used by default.
geometry_type : str, optional (default: 'text')
'text' : Plot text objects found on page, useful to get
table_area and columns coordinates.
'table' : Plot parsed table.
'contour'* : Plot detected rectangles.
'joint'* : Plot detected line intersections.
'line'* : Plot detected lines.
table_area : list, optional (default: None)
List of table areas to analyze as strings of the form
x1,y1,x2,y2 where (x1, y1) -> left-top and
(x2, y2) -> right-bottom in pdf coordinate space.
columns^ : list, optional (default: None)
List of column x-coordinates as strings where the coordinates
are comma-separated.
split_text : bool, optional (default: False)
Whether or not to split a text line if it spans across
multiple cells.
flag_size : bool, optional (default: False)
Whether or not to highlight a substring using <s></s>
if its size is different from rest of the string, useful for
super and subscripts.
row_close_tol^ : int, optional (default: 2)
Rows will be formed by combining text vertically
within this tolerance.
col_close_tol^ : int, optional (default: 0)
Columns will be formed by combining text horizontally
within this tolerance.
process_background* : bool, optional (default: False)
Whether or not to process lines that are in background.
line_size_scaling* : int, optional (default: 15)
Factor by which the page dimensions will be divided to get
smallest length of lines that should be detected.
The larger this value, the smaller the detected lines. Making it
too large will lead to text being detected as lines.
copy_text* : list, optional (default: None)
{'h', 'v'}
Select one or more strings from above and pass them as a list
to specify the direction in which text should be copied over
when a cell spans multiple rows or columns.
shift_text* : list, optional (default: ['l', 't'])
{'l', 'r', 't', 'b'}
Select one or more strings from above and pass them as a list
to specify where the text in a spanning cell should flow.
line_close_tol* : int, optional (default: 2)
Tolerance parameter used to merge vertical and horizontal
detected lines which lie close to each other.
joint_close_tol* : int, optional (default: 2)
Tolerance parameter used to decide whether the detected lines
and points lie close to each other.
threshold_blocksize : int, optional (default: 15)
Size of a pixel neighborhood that is used to calculate a
threshold value for the pixel: 3, 5, 7, and so on.
For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
threshold_constant : int, optional (default: -2)
Constant subtracted from the mean or weighted mean.
Normally, it is positive but may be zero or negative as well.
For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
iterations : int, optional (default: 0)
Number of times erosion/dilation is applied.
For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
margins : tuple
PDFMiner margins. (char_margin, line_margin, word_margin)
For more information, refer to `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
debug : bool, optional (default: False)
Whether or not to return all text objects on the page
which can be used to generate a matplotlib plot, to get
values for table_area(s) and debugging.
""" """
# explicit type conversion # validate kwargs?
p = PDFHandler(filepath, pages) p = PDFHandler(filepath, pages)
kwargs.update({'debug': geometry_type}) debug = True if geometry_type else False
kwargs.update({'debug': debug})
__, geometry = p.parse(mesh=mesh, **kwargs) __, geometry = p.parse(mesh=mesh, **kwargs)
if geometry_type == 'text': if geometry_type == 'text':

View File

@ -19,14 +19,15 @@ from pdfminer.layout import (LAParams, LTAnno, LTChar, LTTextLineHorizontal,
def setup_logging(name): def setup_logging(name):
""" """Sets up a logger with StreamHandler.
Parameters Parameters
---------- ----------
name name : str
Returns Returns
------- -------
logger : logging.Logger
""" """
logger = logging.getLogger(name) logger = logging.getLogger(name)
@ -47,15 +48,16 @@ logger = setup_logging(__name__)
def translate(x1, x2): def translate(x1, x2):
""" """Translates x2 by x1.
Parameters Parameters
---------- ----------
x1 x1 : float
x2 x2 : float
Returns Returns
------- -------
x2 : float
""" """
x2 += x1 x2 += x1
@ -63,15 +65,16 @@ def translate(x1, x2):
def scale(x, s): def scale(x, s):
""" """Scales x by scaling factor s.
Parameters Parameters
---------- ----------
x x : float
s s : float
Returns Returns
------- -------
x : float
""" """
x *= s x *= s
@ -79,18 +82,21 @@ def scale(x, s):
def rotate(x1, y1, x2, y2, angle): def rotate(x1, y1, x2, y2, angle):
""" """Rotates point x2, y2 about point x1, y1 by angle.
Parameters Parameters
---------- ----------
x1 x1 : float
y1 y1 : float
x2 x2 : float
y2 y2 : float
angle angle : float
Angle in radians.
Returns Returns
------- -------
xnew : float
ynew : float
""" """
s = np.sin(angle) s = np.sin(angle)
@ -105,15 +111,26 @@ def rotate(x1, y1, x2, y2, angle):
def scale_pdf(k, factors): def scale_pdf(k, factors):
""" """Translates and scales pdf coordinate space to image
coordinate space.
Parameters Parameters
---------- ----------
k k : tuple
factors Tuple (x1, y1, x2, y2) representing table bounding box where
(x1, y1) -> lt and (x2, y2) -> rb in PDFMiner coordinate
space.
factors : tuple
Tuple (scaling_factor_x, scaling_factor_y, pdf_y) where the
first two elements are scaling factors and pdf_y is height of
pdf.
Returns Returns
------- -------
knew : tuple
Tuple (x1, y1, x2, y2) representing table bounding box where
(x1, y1) -> lt and (x2, y2) -> rb in OpenCV coordinate
space.
""" """
x1, y1, x2, y2 = k x1, y1, x2, y2 = k
@ -127,17 +144,28 @@ def scale_pdf(k, factors):
def scale_image(tables, v_segments, h_segments, factors): def scale_image(tables, v_segments, h_segments, factors):
""" """Translates and scales image coordinate space to pdf
coordinate space.
Parameters Parameters
---------- ----------
tables tables : dict
v_segments Dict with table boundaries as keys and list of intersections
h_segments in that boundary as value.
factors v_segments : list
List of vertical line segments.
h_segments : list
List of horizontal line segments.
factors : tuple
Tuple (scaling_factor_x, scaling_factor_y, img_y) where the
first two elements are scaling factors and img_y is height of
image.
Returns Returns
------- -------
tables_new : dict
v_segments_new : dict
h_segments_new : dict
""" """
scaling_factor_x, scaling_factor_y, img_y = factors scaling_factor_x, scaling_factor_y, img_y = factors
@ -172,16 +200,23 @@ def scale_image(tables, v_segments, h_segments, factors):
def get_rotation(lttextlh, lttextlv, ltchar): def get_rotation(lttextlh, lttextlv, ltchar):
""" """Detects if text in table is rotated or not using the current
transformation matrix (CTM) and returns its orientation.
Parameters Parameters
---------- ----------
lttextlh lttextlh : list
lttextlv List of PDFMiner LTTextLineHorizontal objects.
ltchar lttextlv : list
List of PDFMiner LTTextLineVertical objects.
ltchar : list
List of PDFMiner LTChar objects.
Returns Returns
------- -------
rotation : string
'' if text in table is upright, otherwise 'anticlockwise' or
'clockwise' for text rotated by 90 degrees.
""" """
rotation = '' rotation = ''
@ -190,21 +225,30 @@ def get_rotation(lttextlh, lttextlv, ltchar):
if hlen < vlen: if hlen < vlen:
clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in ltchar) clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in ltchar)
anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in ltchar) anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in ltchar)
rotation = 'left' if clockwise < anticlockwise else 'right' rotation = 'clockwise' if clockwise < anticlockwise else 'anticlockwise'
return rotation return rotation
def segments_in_bbox(bbox, v_segments, h_segments): def segments_in_bbox(bbox, v_segments, h_segments):
""" """Returns all line segments present inside a bounding box.
Parameters Parameters
---------- ----------
bbox bbox : tuple
v_segments Tuple (x1, y1, x2, y2) representing a bounding box where
h_segments (x1, y1) -> lb and (x2, y2) -> rt in PDFMiner coordinate
space.
v_segments : list
List of vertical line segments.
h_segments : list
List of horizontal line segments.
Returns Returns
------- -------
v_s : list
List of vertical line segments that lie inside table.
h_s : list
List of horizontal line segments that lie inside table.
""" """
lb = (bbox[0], bbox[1]) lb = (bbox[0], bbox[1])
@ -217,35 +261,42 @@ def segments_in_bbox(bbox, v_segments, h_segments):
def text_in_bbox(bbox, text): def text_in_bbox(bbox, text):
""" """Returns all text objects present inside a bounding box.
Parameters Parameters
---------- ----------
bbox bbox : tuple
text Tuple (x1, y1, x2, y2) representing a bounding box where
(x1, y1) -> lb and (x2, y2) -> rt in PDFMiner coordinate
space.
text : list
List of PDFMiner text objects.
Returns Returns
------- -------
t_bbox : list
List of PDFMiner text objects that lie inside table.
""" """
lb = (bbox[0], bbox[1]) lb = (bbox[0], bbox[1])
rt = (bbox[2], bbox[3]) rt = (bbox[2], bbox[3])
t_bbox = [t for t in text if lb[0] - 2 <= (t.x0 + t.x1) / 2.0 t_bbox = [t for t in text if lb[0] - 2 <= (t.x0 + t.x1) / 2.0
<= rt[0] + 2 and lb[1] - 2 <= (t.y0 + t.y1) / 2.0 <= rt[0] + 2 and lb[1] - 2 <= (t.y0 + t.y1) / 2.0
<= rt[1] + 2] <= rt[1] + 2]
return t_bbox return t_bbox
def remove_close_lines(ar, line_close_tol=2): def remove_close_lines(ar, line_close_tol=2):
""" """Removes lines which are within a tolerance, based on their x or
y axis projections.
Parameters Parameters
---------- ----------
ar ar : list
line_close_tol line_close_tol : int, optional (default: 2)
Returns Returns
------- -------
ret : list
""" """
ret = [] ret = []
@ -262,15 +313,17 @@ def remove_close_lines(ar, line_close_tol=2):
def merge_close_lines(ar, line_close_tol=2): def merge_close_lines(ar, line_close_tol=2):
""" """Merges lines which are within a tolerance by calculating a
moving mean, based on their x or y axis projections.
Parameters Parameters
---------- ----------
ar ar : list
line_close_tol line_close_tol : int, optional (default: 2)
Returns Returns
------- -------
ret : list
""" """
ret = [] ret = []
@ -288,15 +341,19 @@ def merge_close_lines(ar, line_close_tol=2):
def flag_font_size(textline, direction): def flag_font_size(textline, direction):
""" """Flags super/subscripts in text by enclosing them with <s></s>.
May give false positives.
Parameters Parameters
---------- ----------
textline textline : list
direction List of PDFMiner LTChar objects.
direction : string
Direction of the PDFMiner LTTextLine object.
Returns Returns
------- -------
fstring : string
""" """
if direction == 'horizontal': if direction == 'horizontal':
@ -324,18 +381,27 @@ def flag_font_size(textline, direction):
return fstring return fstring
def split_textline(table, textline, direction, flag_size=True): def split_textline(table, textline, direction, flag_size=False):
""" """Splits PDFMiner LTTextLine into substrings if it spans across
multiple rows/columns.
Parameters Parameters
---------- ----------
table table : camelot.core.Table
textline textline : object
direction PDFMiner LTTextLine object.
flag_size direction : string
Direction of the PDFMiner LTTextLine object.
flag_size : bool, optional (default: False)
Whether or not to highlight a substring using <s></s>
if its size is different from rest of the string, useful for
super and subscripts.
Returns Returns
------- -------
grouped_chars : list
List of tuples of the form (idx, text) where idx is the index
of row/column and text is an LTTextLine substring.
""" """
idx = 0 idx = 0
@ -388,19 +454,38 @@ def split_textline(table, textline, direction, flag_size=True):
return grouped_chars return grouped_chars
def get_table_index(table, t, direction, split_text=False, flag_size=True): def get_table_index(table, t, direction, split_text=False, flag_size=False):
""" """Gets indices of the table cell where given text object lies by
comparing their y and x-coordinates.
Parameters Parameters
---------- ----------
table table : camelot.core.Table
t t : object
direction PDFMiner LTTextLine object.
split_text direction : string
flag_size Direction of the PDFMiner LTTextLine object.
split_text : bool, optional (default: False)
Whether or not to split a text line if it spans across
multiple cells.
flag_size : bool, optional (default: False)
Whether or not to highlight a substring using <s></s>
if its size is different from rest of the string, useful for
super and subscripts.
Returns Returns
------- -------
indices : list
List of tuples of the form (r_idx, c_idx, text) where r_idx
and c_idx are row and column indices.
error : float
Assignment error, percentage of text area that lies outside
a cell.
+-------+
| |
| [Text bounding box]
| |
+-------+
""" """
r_idx, c_idx = [-1] * 2 r_idx, c_idx = [-1] * 2
@ -450,14 +535,19 @@ def get_table_index(table, t, direction, split_text=False, flag_size=True):
def compute_accuracy(error_weights): def compute_accuracy(error_weights):
""" """Calculates a score based on weights assigned to various
parameters and their error percentages.
Parameters Parameters
---------- ----------
error_weights error_weights : list
Two-dimensional list of the form [[p1, e1], [p2, e2], ...]
where pn is the weight assigned to list of errors en.
Sum of pn should be equal to 100.
Returns Returns
------- -------
score : float
""" """
SCORE_VAL = 100 SCORE_VAL = 100
@ -474,50 +564,40 @@ def compute_accuracy(error_weights):
return score return score
def count_empty_strings(d): def compute_whitespace(d):
""" """Calculates the percentage of empty strings in a
two-dimensional list.
Parameters Parameters
---------- ----------
d d : list
Returns Returns
------- -------
whitespace : float
Percentage of empty cells.
""" """
empty_p = 0 whitespace = 0
r_nempty_cells, c_nempty_cells = [], [] r_nempty_cells, c_nempty_cells = [], []
for i in d: for i in d:
for j in i: for j in i:
if j.strip() == '': if j.strip() == '':
empty_p += 1 whitespace += 1
empty_p = 100 * (empty_p / float(len(d) * len(d[0]))) whitespace = 100 * (whitespace / float(len(d) * len(d[0])))
for row in d: return whitespace
r_nempty_c = 0
for r in row:
if r.strip() != '':
r_nempty_c += 1
r_nempty_cells.append(r_nempty_c)
d = zip(*d)
d = [list(col) for col in d]
for col in d:
c_nempty_c = 0
for c in col:
if c.strip() != '':
c_nempty_c += 1
c_nempty_cells.append(c_nempty_c)
return empty_p, r_nempty_cells, c_nempty_cells
def remove_empty_strings(d): def remove_empty(d):
""" """Removes empty rows and columns from a two-dimensional list.
Parameters Parameters
---------- ----------
d d : list
Returns Returns
------- -------
d : list
""" """
for i, row in enumerate(d): for i, row in enumerate(d):
@ -530,70 +610,46 @@ def remove_empty_strings(d):
def encode_(ar): def encode_(ar):
""" """Encodes two-dimensional list into unicode.
Parameters Parameters
---------- ----------
ar ar : list
Returns Returns
------- -------
ar : list
""" """
ar = [[r.encode('utf-8') for r in row] for row in ar] ar = [[r.encode('utf-8') for r in row] for row in ar]
return ar return ar
def get_text_objects(layout, ltype="char", t=None): def get_page_layout(filename, char_margin=1.0, line_margin=0.5, word_margin=0.1,
"""
Parameters
----------
layout
ltype
t
Returns
-------
"""
if ltype == "char":
LTObject = LTChar
elif ltype == "lh":
LTObject = LTTextLineHorizontal
elif ltype == "lv":
LTObject = LTTextLineVertical
if t is None:
t = []
try:
for obj in layout._objs:
if isinstance(obj, LTObject):
t.append(obj)
else:
t += get_text_objects(obj, ltype=ltype)
except AttributeError:
pass
return t
def get_page_layout(pname, char_margin=1.0, line_margin=0.5, word_margin=0.1,
detect_vertical=True, all_texts=True): detect_vertical=True, all_texts=True):
""" """Returns a PDFMiner LTPage object and page dimension of a single
page pdf. See https://euske.github.io/pdfminer/ to get definitions
of kwargs.
Parameters Parameters
---------- ----------
pname filename : string
char_margin Path to pdf file.
line_margin char_margin : float
word_margin line_margin : float
detect_vertical word_margin : float
all_texts detect_vertical : bool
all_texts : bool
Returns Returns
------- -------
layout : object
PDFMiner LTPage object.
dim : tuple
Dimension of pdf page in the form (width, height).
""" """
with open(pname, 'r') as f: with open(filename, 'r') as f:
parser = PDFParser(f) parser = PDFParser(f)
document = PDFDocument(parser) document = PDFDocument(parser)
if not document.is_extractable: if not document.is_extractable:
@ -615,12 +671,56 @@ def get_page_layout(pname, char_margin=1.0, line_margin=0.5, word_margin=0.1,
return layout, dim return layout, dim
def merge_tuples(tuples): def get_text_objects(layout, ltype="char", t=None):
""" """Recursively parses pdf layout to get a list of
PDFMiner text objects.
Parameters Parameters
---------- ----------
tuples layout : object
PDFMiner LTPage object.
ltype : string
Specify 'char', 'lh', 'lv' to get LTChar, LTTextLineHorizontal,
and LTTextLineVertical objects respectively.
t : list
Returns
-------
t : list
List of PDFMiner text objects.
"""
if ltype == "char":
LTObject = LTChar
elif ltype == "lh":
LTObject = LTTextLineHorizontal
elif ltype == "lv":
LTObject = LTTextLineVertical
if t is None:
t = []
try:
for obj in layout._objs:
if isinstance(obj, LTObject):
t.append(obj)
else:
t += get_text_objects(obj, ltype=ltype)
except AttributeError:
pass
return t
def merge_tuples(tuples):
"""Merges a list of overlapping tuples.
Parameters
----------
tuples : list
List of tuples where a tuple is a single axis coordinate pair.
Yields
------
tuple
""" """
merged = list(tuples[0]) merged = list(tuples[0])
for s, e in tuples: for s, e in tuples:

View File

@ -4,17 +4,37 @@
API Reference API Reference
============= =============
Pdf camelot.read_pdf
=== ================
.. automodule:: camelot.pdf .. automodule:: camelot.read_pdf
:members: :members:
Lattice camelot.handlers.PDFHandler
======= ===========================
.. automodule:: camelot.lattice .. automodule:: camelot.handlers.PDFHandler
:members: :members:
Stream camelot.parsers.Stream
====== ======================
.. automodule:: camelot.stream .. automodule:: camelot.parsers.Stream
:members:
camelot.parsers.Lattice
=======================
.. automodule:: camelot.parsers.Lattice
:members:
camelot.core.Cell
=================
.. automodule:: camelot.core.Cell
:members:
camelot.core.Table
==================
.. automodule:: camelot.core.Table
:members:
camelot.core.TableList
======================
.. automodule:: camelot.core.TableList
:members: :members:

View File

@ -3,11 +3,11 @@
You can adapt this file completely to your liking, but it should at least You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive. contain the root `toctree` directive.
================================== =====================================
Camelot: pdf parsing made simpler! Camelot: PDF Table Parsing for Humans
================================== =====================================
Camelot is a Python 2.7 library and command-line tool for getting tables out of pdf files. Camelot is a Python 2.7 library and command-line tool for extracting tabular data from PDF files.
Why another pdf table parsing library? Why another pdf table parsing library?
====================================== ======================================
@ -32,12 +32,22 @@ Usage
:: ::
>>> from camelot.pdf import Pdf >>> import camelot
>>> from camelot.lattice import Lattice >>> tables = camelot.read_pdf("foo.pdf")
>>> tables
>>> manager = Pdf(Lattice(), 'us-030.pdf') <TableList n=2>
>>> tables = manager.extract() >>> tables.export("foo.csv", f="csv", compress=True) # json, excel, html
>>> print tables['page-1']['table-1']['data'] >>> tables[0]
<Table shape=(3,4)>
>>> tables[0].to_csv("foo.csv") # to_json, to_excel, to_html
>>> tables[0].parsing_report
{
"accuracy": 96,
"whitespace": 80,
"order": 1,
"page": 1
}
>>> df = tables[0].df
.. csv-table:: .. csv-table::
:header: "Cycle Name","KI (1/km)","Distance (mi)","Percent Fuel Savings","","","" :header: "Cycle Name","KI (1/km)","Distance (mi)","Percent Fuel Savings","","",""
@ -49,45 +59,6 @@ Usage
"2032_2","0.17","57.8","21.7%","0.3%","2.7%","1.2%" "2032_2","0.17","57.8","21.7%","0.3%","2.7%","1.2%"
"4171_1","0.07","173.9","58.1%","1.6%","2.1%","0.5%" "4171_1","0.07","173.9","58.1%","1.6%","2.1%","0.5%"
Camelot comes with a CLI where you can specify page numbers, output format, output directory etc. By default, the output files are placed in the same directory as the PDF.
::
Camelot: PDF parsing made simpler!
usage:
camelot [options] <method> [<args>...]
options:
-h, --help Show this screen.
-v, --version Show version.
-V, --verbose Verbose.
-p, --pages <pageno> Comma-separated list of page numbers.
Example: -p 1,3-6,10 [default: 1]
-P, --parallel Parallelize the parsing process.
-f, --format <format> Output format. (csv,tsv,html,json,xlsx) [default: csv]
-l, --log Log to file.
-o, --output <directory> Output directory.
-M, --cmargin <cmargin> Char margin. Chars closer than cmargin are
grouped together to form a word. [default: 1.0]
-L, --lmargin <lmargin> Line margin. Lines closer than lmargin are
grouped together to form a textbox. [default: 0.5]
-W, --wmargin <wmargin> Word margin. Insert blank spaces between chars
if distance between words is greater than word
margin. [default: 0.1]
-J, --split_text Split text lines if they span across multiple cells.
-K, --flag_size Flag substring if its size differs from the whole string.
Useful for super and subscripts.
-X, --print-stats List stats on the parsing process.
-Y, --save-stats Save stats to a file.
-Z, --plot <dist> Plot distributions. (page,all,rc)
camelot methods:
lattice Looks for lines between data.
stream Looks for spaces between data.
See 'camelot <method> -h' for more information on a specific method.
Installation Installation
============ ============
@ -95,42 +66,41 @@ Make sure you have the most updated versions for `pip` and `setuptools`. You can
pip install -U pip setuptools pip install -U pip setuptools
The required dependencies include `numpy`_, `OpenCV`_ and `ImageMagick`_. The dependencies include `tk`_ and `ghostscript`_.
.. _numpy: http://www.numpy.org/ .. _tk: https://wiki.tcl.tk/3743
.. _OpenCV: http://opencv.org/ .. _ghostscript: https://www.ghostscript.com/
.. _ImageMagick: http://www.imagemagick.org/script/index.php
Installing dependencies Installing dependencies
----------------------- -----------------------
numpy can be installed using `pip`. OpenCV and imagemagick can be installed using your system's default package manager. tk and ghostscript can be installed using your system's default package manager.
Linux Linux
^^^^^ ^^^^^
* Arch Linux
::
sudo pacman -S opencv imagemagick
* Ubuntu * Ubuntu
:: ::
sudo apt-get install libopencv-dev python-opencv imagemagick sudo apt-get install python-opencv python-tk ghostscript
* Arch Linux
::
sudo pacman -S opencv tk ghostscript
OS X OS X
^^^^ ^^^^
:: ::
brew install homebrew/science/opencv imagemagick brew install homebrew/science/opencv ghostscript
Finally, `cd` into the project directory and install by:: Finally, `cd` into the project directory and install by::
make install python setup.py install
API Reference API Reference
============= =============
@ -150,14 +120,14 @@ You can check the latest sources with the command::
Contributing Contributing
------------ ------------
See :doc:`Contributing doc <contributing>`. See :doc:`Contributing guidelines <contributing>`.
Testing Testing
------- -------
:: ::
make test python setup.py test
License License
======= =======

View File

@ -0,0 +1,11 @@
click==6.7
matplotlib==2.2.3
numpy==1.13.3
opencv-python==3.4.2.17
pandas==0.23.4
pdfminer==20140328
Pillow==5.2.0
PyPDF2==1.26.0
pytest==3.8.0
pytest-runner==4.2
Sphinx==1.8.0b1

View File

@ -1,8 +1,8 @@
docopt==0.6.2 click==6.7
matplotlib==2.2.3 matplotlib==2.2.3
nose==1.3.7 numpy==1.13.3
opencv-python==3.4.2.17
pandas==0.23.4
pdfminer==20140328 pdfminer==20140328
pyexcel-xlsx==0.5.6
Pillow==5.2.0 Pillow==5.2.0
PyPDF2==1.26.0 PyPDF2==1.26.0
Sphinx==1.8.0b1

View File

@ -4,12 +4,12 @@ import camelot
NAME = 'camelot' NAME = 'camelot'
VERSION = camelot.__version__ VERSION = camelot.__version__
DESCRIPTION = 'camelot parses tables from PDFs!' DESCRIPTION = 'PDF Table Parsing for Humans'
with open('README.md') as f: with open('README.md') as f:
LONG_DESCRIPTION = f.read() LONG_DESCRIPTION = f.read()
URL = 'https://github.com/socialcopsdev/camelot' URL = 'https://github.com/socialcopsdev/camelot'
AUTHOR = 'Vinayak Mehta' AUTHOR = 'Vinayak Mehta'
AUTHOR_EMAIL = 'vinayak@socialcops.com' AUTHOR_EMAIL = 'vmehta94@gmail.com'
LICENSE = 'BSD License' LICENSE = 'BSD License'
opencv_min_version = '2.4.8' opencv_min_version = '2.4.8'
@ -58,18 +58,14 @@ def setup_package():
opencv_status = get_opencv_status() opencv_status = get_opencv_status()
opencv_req_str = "camelot requires OpenCV >= {0}.\n".format(opencv_min_version) opencv_req_str = "camelot requires OpenCV >= {0}.\n".format(opencv_min_version)
instructions = ("Installation instructions are available in the README at "
"https://github.com/socialcopsdev/camelot")
if opencv_status['up_to_date'] is False: if opencv_status['up_to_date'] is False:
if opencv_status['version']: if opencv_status['version']:
raise ImportError("Your installation of OpenCV " raise ImportError("Your installation of OpenCV {} is out-of-date.\n{}"
"{0} is out-of-date.\n{1}{2}" .format(opencv_status['version'], opencv_req_str))
.format(opencv_status['version'],
opencv_req_str, instructions))
else: else:
raise ImportError("OpenCV is not installed.\n{0}{1}" raise ImportError("OpenCV is not installed.\n{}"
.format(opencv_req_str, instructions)) .format(opencv_req_str))
setup(**metadata) setup(**metadata)