Add docstrings and update docs
parent
1f71513004
commit
23ec6b55f7
|
|
@ -69,7 +69,7 @@ $ conda install -c conda-forge camelot-py
|
||||||
|
|
||||||
### Using pip
|
### Using pip
|
||||||
|
|
||||||
After [installing the dependencies](https://camelot-py.readthedocs.io/en/master/user/install.html#using-pip) ([tk](https://packages.ubuntu.com/trusty/python-tk) and [ghostscript](https://www.ghostscript.com/)), you can simply use pip to install Camelot:
|
After [installing the dependencies](https://camelot-py.readthedocs.io/en/master/user/install-deps.html) ([tk](https://packages.ubuntu.com/trusty/python-tk) and [ghostscript](https://www.ghostscript.com/)), you can simply use pip to install Camelot:
|
||||||
|
|
||||||
<pre>
|
<pre>
|
||||||
$ pip install camelot-py[cv]
|
$ pip install camelot-py[cv]
|
||||||
|
|
|
||||||
|
|
@ -1,18 +1,23 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
VERSION = (0, 4, 0)
|
VERSION = (0, 4, 0)
|
||||||
PHASE = 'alpha' # alpha, beta or rc
|
PRERELEASE = None # alpha, beta or rc
|
||||||
PHASE_VERSION = '1'
|
REVISION = None
|
||||||
|
|
||||||
|
|
||||||
|
def generate_version(version, prerelease=None, revision=None):
|
||||||
|
version_parts = ['.'.join(map(str, version))]
|
||||||
|
if prerelease is not None:
|
||||||
|
version_parts.append('-{}'.format(prerelease))
|
||||||
|
if revision is not None:
|
||||||
|
version_parts.append('.{}'.format(revision))
|
||||||
|
return ''.join(version_parts)
|
||||||
|
|
||||||
|
|
||||||
__title__ = 'camelot-py'
|
__title__ = 'camelot-py'
|
||||||
__description__ = 'PDF Table Extraction for Humans.'
|
__description__ = 'PDF Table Extraction for Humans.'
|
||||||
__url__ = 'http://camelot-py.readthedocs.io/'
|
__url__ = 'http://camelot-py.readthedocs.io/'
|
||||||
if PHASE:
|
__version__ = generate_version(VERSION, prerelease=PRERELEASE, revision=REVISION)
|
||||||
__version__ = '{}-{}'.format('.'.join(map(str, VERSION)), PHASE)
|
|
||||||
if PHASE_VERSION:
|
|
||||||
__version__ = '{}.{}'.format(__version__, PHASE_VERSION)
|
|
||||||
else:
|
|
||||||
__version__ = '.'.join(map(str, VERSION))
|
|
||||||
__author__ = 'Vinayak Mehta'
|
__author__ = 'Vinayak Mehta'
|
||||||
__author_email__ = 'vmehta94@gmail.com'
|
__author_email__ = 'vmehta94@gmail.com'
|
||||||
__license__ = 'MIT License'
|
__license__ = 'MIT License'
|
||||||
|
|
|
||||||
|
|
@ -20,6 +20,29 @@ TABLE_AREA_PADDING = 10
|
||||||
|
|
||||||
|
|
||||||
class TextEdge(object):
|
class TextEdge(object):
|
||||||
|
"""Defines a text edge's coordinates relative to a left-bottom
|
||||||
|
origin. (PDF coordinate space)
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
x : float
|
||||||
|
x-coordinate of the text edge.
|
||||||
|
y0 : float
|
||||||
|
y-coordinate of bottommost point.
|
||||||
|
y1 : float
|
||||||
|
y-coordinate of topmost point.
|
||||||
|
align : string, optional (default: 'left')
|
||||||
|
{'left', 'right', 'middle'}
|
||||||
|
|
||||||
|
Attributes
|
||||||
|
----------
|
||||||
|
intersections: int
|
||||||
|
Number of intersections with horizontal text rows.
|
||||||
|
is_valid: bool
|
||||||
|
A text edge is valid if it intersects at least
|
||||||
|
TEXTEDGE_REQUIRED_ELEMENTS horizontal text rows.
|
||||||
|
|
||||||
|
"""
|
||||||
def __init__(self, x, y0, y1, align='left'):
|
def __init__(self, x, y0, y1, align='left'):
|
||||||
self.x = x
|
self.x = x
|
||||||
self.y0 = y0
|
self.y0 = y0
|
||||||
|
|
@ -33,6 +56,9 @@ class TextEdge(object):
|
||||||
round(self.x, 2), round(self.y0, 2), round(self.y1, 2), self.align, self.is_valid)
|
round(self.x, 2), round(self.y0, 2), round(self.y1, 2), self.align, self.is_valid)
|
||||||
|
|
||||||
def update_coords(self, x, y0):
|
def update_coords(self, x, y0):
|
||||||
|
"""Updates the text edge's x and bottom y coordinates and sets
|
||||||
|
the is_valid attribute.
|
||||||
|
"""
|
||||||
if np.isclose(self.y0, y0, atol=TEXTEDGE_EXTEND_TOLERANCE):
|
if np.isclose(self.y0, y0, atol=TEXTEDGE_EXTEND_TOLERANCE):
|
||||||
self.x = (self.intersections * self.x + x) / float(self.intersections + 1)
|
self.x = (self.intersections * self.x + x) / float(self.intersections + 1)
|
||||||
self.y0 = y0
|
self.y0 = y0
|
||||||
|
|
@ -44,11 +70,18 @@ class TextEdge(object):
|
||||||
|
|
||||||
|
|
||||||
class TextEdges(object):
|
class TextEdges(object):
|
||||||
|
"""Defines a dict of left, right and middle text edges found on
|
||||||
|
the PDF page. The dict has three keys based on the alignments,
|
||||||
|
and each key's value is a list of camelot.core.TextEdge objects.
|
||||||
|
"""
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self._textedges = {'left': [], 'middle': [], 'right': []}
|
self._textedges = {'left': [], 'right': [], 'middle': []}
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_x_coord(textline, align):
|
def get_x_coord(textline, align):
|
||||||
|
"""Returns the x coordinate of a text row based on the
|
||||||
|
specified alignment.
|
||||||
|
"""
|
||||||
x_left = textline.x0
|
x_left = textline.x0
|
||||||
x_right = textline.x1
|
x_right = textline.x1
|
||||||
x_middle = x_left + (x_right - x_left) / 2.0
|
x_middle = x_left + (x_right - x_left) / 2.0
|
||||||
|
|
@ -56,12 +89,17 @@ class TextEdges(object):
|
||||||
return x_coord[align]
|
return x_coord[align]
|
||||||
|
|
||||||
def find(self, x_coord, align):
|
def find(self, x_coord, align):
|
||||||
|
"""Returns the index of an existing text edge using
|
||||||
|
the specified x coordinate and alignment.
|
||||||
|
"""
|
||||||
for i, te in enumerate(self._textedges[align]):
|
for i, te in enumerate(self._textedges[align]):
|
||||||
if np.isclose(te.x, x_coord, atol=0.5):
|
if np.isclose(te.x, x_coord, atol=0.5):
|
||||||
return i
|
return i
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def add(self, textline, align):
|
def add(self, textline, align):
|
||||||
|
"""Adds a new text edge to the current dict.
|
||||||
|
"""
|
||||||
x = self.get_x_coord(textline, align)
|
x = self.get_x_coord(textline, align)
|
||||||
y0 = textline.y0
|
y0 = textline.y0
|
||||||
y1 = textline.y1
|
y1 = textline.y1
|
||||||
|
|
@ -69,6 +107,8 @@ class TextEdges(object):
|
||||||
self._textedges[align].append(te)
|
self._textedges[align].append(te)
|
||||||
|
|
||||||
def update(self, textline):
|
def update(self, textline):
|
||||||
|
"""Updates an existing text edge in the current dict.
|
||||||
|
"""
|
||||||
for align in ['left', 'right', 'middle']:
|
for align in ['left', 'right', 'middle']:
|
||||||
x_coord = self.get_x_coord(textline, align)
|
x_coord = self.get_x_coord(textline, align)
|
||||||
idx = self.find(x_coord, align)
|
idx = self.find(x_coord, align)
|
||||||
|
|
@ -78,11 +118,18 @@ class TextEdges(object):
|
||||||
self._textedges[align][idx].update_coords(x_coord, textline.y0)
|
self._textedges[align][idx].update_coords(x_coord, textline.y0)
|
||||||
|
|
||||||
def generate(self, textlines):
|
def generate(self, textlines):
|
||||||
|
"""Generates the text edges dict based on horizontal text
|
||||||
|
rows.
|
||||||
|
"""
|
||||||
for tl in textlines:
|
for tl in textlines:
|
||||||
if len(tl.get_text().strip()) > 1: # TODO: hacky
|
if len(tl.get_text().strip()) > 1: # TODO: hacky
|
||||||
self.update(tl)
|
self.update(tl)
|
||||||
|
|
||||||
def get_relevant(self):
|
def get_relevant(self):
|
||||||
|
"""Returns the list of relevant text edges (all share the same
|
||||||
|
alignment) based on which list intersects horizontal text rows
|
||||||
|
the most.
|
||||||
|
"""
|
||||||
intersections_sum = {
|
intersections_sum = {
|
||||||
'left': sum(te.intersections for te in self._textedges['left'] if te.is_valid),
|
'left': sum(te.intersections for te in self._textedges['left'] if te.is_valid),
|
||||||
'right': sum(te.intersections for te in self._textedges['right'] if te.is_valid),
|
'right': sum(te.intersections for te in self._textedges['right'] if te.is_valid),
|
||||||
|
|
@ -96,6 +143,9 @@ class TextEdges(object):
|
||||||
return self._textedges[relevant_align]
|
return self._textedges[relevant_align]
|
||||||
|
|
||||||
def get_table_areas(self, textlines, relevant_textedges):
|
def get_table_areas(self, textlines, relevant_textedges):
|
||||||
|
"""Returns a dict of interesting table areas on the PDF page
|
||||||
|
calculated using relevant text edges.
|
||||||
|
"""
|
||||||
def pad(area, average_row_height):
|
def pad(area, average_row_height):
|
||||||
x0 = area[0] - TABLE_AREA_PADDING
|
x0 = area[0] - TABLE_AREA_PADDING
|
||||||
y0 = area[1] - TABLE_AREA_PADDING
|
y0 = area[1] - TABLE_AREA_PADDING
|
||||||
|
|
|
||||||
|
|
@ -247,10 +247,13 @@ class Stream(BaseParser):
|
||||||
" should be equal")
|
" should be equal")
|
||||||
|
|
||||||
def _nurminen_table_detection(self, textlines):
|
def _nurminen_table_detection(self, textlines):
|
||||||
# a general heuristic implementation of the table detection
|
"""A general implementation of the table detection algorithm
|
||||||
# algorithm described by Anssi Nurminen's master's thesis:
|
described in Anssi Nurminen's master's thesis.
|
||||||
# https://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3
|
Link: https://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3
|
||||||
# assumes that tables are situated relatively apart vertically
|
|
||||||
|
Assumes that tables are situated relatively far apart
|
||||||
|
vertically.
|
||||||
|
"""
|
||||||
|
|
||||||
# TODO: add support for arabic text #141
|
# TODO: add support for arabic text #141
|
||||||
# sort textlines in reading order
|
# sort textlines in reading order
|
||||||
|
|
|
||||||
|
|
@ -7,7 +7,7 @@ If you're reading this, you're probably looking to contributing to Camelot. *Tim
|
||||||
|
|
||||||
This document will help you get started with contributing documentation, code, testing and filing issues. If you have any questions, feel free to reach out to `Vinayak Mehta`_, the author and maintainer.
|
This document will help you get started with contributing documentation, code, testing and filing issues. If you have any questions, feel free to reach out to `Vinayak Mehta`_, the author and maintainer.
|
||||||
|
|
||||||
.. _Vinayak Mehta: https://vinayak-mehta.github.io
|
.. _Vinayak Mehta: https://www.vinayakmehta.com
|
||||||
|
|
||||||
Code Of Conduct
|
Code Of Conduct
|
||||||
---------------
|
---------------
|
||||||
|
|
|
||||||
|
|
@ -92,6 +92,7 @@ This part of the documentation begins with some background information about why
|
||||||
:maxdepth: 2
|
:maxdepth: 2
|
||||||
|
|
||||||
user/intro
|
user/intro
|
||||||
|
user/install-deps
|
||||||
user/install
|
user/install
|
||||||
user/how-it-works
|
user/how-it-works
|
||||||
user/quickstart
|
user/quickstart
|
||||||
|
|
|
||||||
|
|
@ -5,24 +5,24 @@ How It Works
|
||||||
|
|
||||||
This part of the documentation includes a high-level explanation of how Camelot extracts tables from PDF files.
|
This part of the documentation includes a high-level explanation of how Camelot extracts tables from PDF files.
|
||||||
|
|
||||||
You can choose between two table parsing methods, *Stream* and *Lattice*. These names for parsing methods inside Camelot were inspired from `Tabula`_.
|
You can choose between two table parsing methods, *Stream* and *Lattice*. These names for parsing methods inside Camelot were inspired by `Tabula <https://github.com/tabulapdf/tabula>`_.
|
||||||
|
|
||||||
.. _Tabula: https://github.com/tabulapdf/tabula
|
|
||||||
|
|
||||||
.. _stream:
|
.. _stream:
|
||||||
|
|
||||||
Stream
|
Stream
|
||||||
------
|
------
|
||||||
|
|
||||||
Stream can be used to parse tables that have whitespaces between cells to simulate a table structure. It looks for these spaces between text to form a table representation.
|
Stream can be used to parse tables that have whitespaces between cells to simulate a table structure. It is built on top of PDFMiner's functionality of grouping characters on a page into words and sentences, using `margins <https://euske.github.io/pdfminer/#tools>`_.
|
||||||
|
|
||||||
It is built on top of PDFMiner's functionality of grouping characters on a page into words and sentences, using `margins`_. After getting the words on a page, it groups them into rows based on their *y* coordinates. It then tries to guess the number of columns the table might have by calculating the mode of the number of words in each row. This mode is used to calculate *x* ranges for the table's columns. It then adds columns to this column range list based on any words that may lie outside or inside the current column *x* ranges.
|
1. Words on the PDF page are grouped into text rows based on their *y* axis overlaps.
|
||||||
|
|
||||||
.. _margins: https://euske.github.io/pdfminer/#tools
|
2. Textedges are calculated and then used to guess interesting table areas on the PDF page. You can read `Anssi Nurminen's master's thesis <http://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3>`_ to learn more about this table detection technique. [See pages 20, 35 and 40]
|
||||||
|
|
||||||
.. note:: By default, Stream treats the whole PDF page as a table, which isn't ideal when there are more than two tables on a page with different number of columns. Automatic table detection for Stream is `in the works`_.
|
3. The number of columns inside each table area is then guessed. This is done by calculating the mode of the number of words in each text row. Based on this mode, words in each text row are chosen to calculate a list of column *x* ranges.
|
||||||
|
|
||||||
.. _in the works: https://github.com/socialcopsdev/camelot/issues/102
|
4. Words that lie inside/outside the current column *x* ranges are then used to extend the current list of columns.
|
||||||
|
|
||||||
|
5. Finally, a table is formed using the text rows' *y* ranges and column *x* ranges and words found on the page are assigned to the table's cells based on their *x* and *y* coordinates.
|
||||||
|
|
||||||
.. _lattice:
|
.. _lattice:
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,76 @@
|
||||||
|
.. _install_deps:
|
||||||
|
|
||||||
|
Installation of dependencies
|
||||||
|
============================
|
||||||
|
|
||||||
|
The dependencies `Tkinter`_ and `ghostscript`_ can be installed using your system's package manager. You can run one of the following, based on your OS.
|
||||||
|
|
||||||
|
.. _Tkinter: https://wiki.python.org/moin/TkInter
|
||||||
|
.. _ghostscript: https://www.ghostscript.com
|
||||||
|
|
||||||
|
OS-specific instructions
|
||||||
|
------------------------
|
||||||
|
|
||||||
|
For Ubuntu
|
||||||
|
^^^^^^^^^^
|
||||||
|
::
|
||||||
|
|
||||||
|
$ apt install python-tk ghostscript
|
||||||
|
|
||||||
|
Or for Python 3::
|
||||||
|
|
||||||
|
$ apt install python3-tk ghostscript
|
||||||
|
|
||||||
|
For macOS
|
||||||
|
^^^^^^^^^
|
||||||
|
::
|
||||||
|
|
||||||
|
$ brew install tcl-tk ghostscript
|
||||||
|
|
||||||
|
For Windows
|
||||||
|
^^^^^^^^^^^
|
||||||
|
|
||||||
|
For Tkinter, you can download the `ActiveTcl Community Edition`_ from ActiveState. For ghostscript, you can get the installer at the `ghostscript downloads page`_.
|
||||||
|
|
||||||
|
After installing ghostscript, you'll need to reboot your system to make sure that the ghostscript executable's path is in the Windows PATH environment variable. In case you don't want to reboot, you can manually add the ghostscript executable's path to the PATH variable, `as shown here`_.
|
||||||
|
|
||||||
|
.. _ActiveTcl Community Edition: https://www.activestate.com/activetcl/downloads
|
||||||
|
.. _ghostscript downloads page: https://www.ghostscript.com/download/gsdnld.html
|
||||||
|
.. _as shown here: https://java.com/en/download/help/path.xml
|
||||||
|
|
||||||
|
Checks to see if dependencies were installed correctly
|
||||||
|
------------------------------------------------------
|
||||||
|
|
||||||
|
You can do the following checks to see if the dependencies were installed correctly.
|
||||||
|
|
||||||
|
For Tkinter
|
||||||
|
^^^^^^^^^^^
|
||||||
|
|
||||||
|
Launch Python, and then at the prompt, type::
|
||||||
|
|
||||||
|
>>> import Tkinter
|
||||||
|
|
||||||
|
Or in Python 3::
|
||||||
|
|
||||||
|
>>> import tkinter
|
||||||
|
|
||||||
|
If you have Tkinter, Python will not print an error message, and if not, you will see an ``ImportError``.
|
||||||
|
|
||||||
|
For ghostscript
|
||||||
|
^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
Run the following to check the ghostscript version.
|
||||||
|
|
||||||
|
For Ubuntu/macOS::
|
||||||
|
|
||||||
|
$ gs -version
|
||||||
|
|
||||||
|
For Windows::
|
||||||
|
|
||||||
|
C:\> gswin64c.exe -version
|
||||||
|
|
||||||
|
Or for Windows 32-bit::
|
||||||
|
|
||||||
|
C:\> gswin32c.exe -version
|
||||||
|
|
||||||
|
If you have ghostscript, you should see the ghostscript version and copyright information.
|
||||||
|
|
@ -3,7 +3,7 @@
|
||||||
Installation of Camelot
|
Installation of Camelot
|
||||||
=======================
|
=======================
|
||||||
|
|
||||||
This part of the documentation covers how to install Camelot.
|
This part of the documentation covers the steps to install Camelot.
|
||||||
|
|
||||||
Using conda
|
Using conda
|
||||||
-----------
|
-----------
|
||||||
|
|
@ -23,84 +23,17 @@ The easiest way to install Camelot is to install it with `conda`_, which is a pa
|
||||||
Using pip
|
Using pip
|
||||||
---------
|
---------
|
||||||
|
|
||||||
First, you'll need to install the dependencies, which include `Tkinter`_ and `ghostscript`_.
|
After :ref:`installing the dependencies <install_deps>`, which include `Tkinter`_ and `ghostscript`_, you can simply use pip to install Camelot::
|
||||||
|
|
||||||
|
$ pip install camelot-py[cv]
|
||||||
|
|
||||||
.. _Tkinter: https://wiki.python.org/moin/TkInter
|
.. _Tkinter: https://wiki.python.org/moin/TkInter
|
||||||
.. _ghostscript: https://www.ghostscript.com
|
.. _ghostscript: https://www.ghostscript.com
|
||||||
|
|
||||||
These can be installed using your system's package manager. You can run one of the following, based on your OS.
|
|
||||||
|
|
||||||
For Ubuntu
|
|
||||||
^^^^^^^^^^
|
|
||||||
::
|
|
||||||
|
|
||||||
$ apt install python-tk ghostscript
|
|
||||||
|
|
||||||
Or for Python 3::
|
|
||||||
|
|
||||||
$ apt install python3-tk ghostscript
|
|
||||||
|
|
||||||
For macOS
|
|
||||||
^^^^^^^^^
|
|
||||||
::
|
|
||||||
|
|
||||||
$ brew install tcl-tk ghostscript
|
|
||||||
|
|
||||||
For Windows
|
|
||||||
^^^^^^^^^^^
|
|
||||||
|
|
||||||
For Tkinter, you can download the `ActiveTcl Community Edition`_ from ActiveState. For ghostscript, you can get the installer at the `ghostscript downloads page`_.
|
|
||||||
|
|
||||||
After installing ghostscript, you'll need to reboot your system to make sure that the ghostscript executable's path is in the windows PATH environment variable. In case you don't want to reboot, you can manually add the ghostscript executable's path to the PATH variable, `as shown here`_.
|
|
||||||
|
|
||||||
.. _ActiveTcl Community Edition: https://www.activestate.com/activetcl/downloads
|
|
||||||
.. _ghostscript downloads page: https://www.ghostscript.com/download/gsdnld.html
|
|
||||||
.. _as shown here: https://java.com/en/download/help/path.xml
|
|
||||||
|
|
||||||
----
|
|
||||||
|
|
||||||
You can do the following checks to see if the dependencies were installed correctly.
|
|
||||||
|
|
||||||
For Tkinter
|
|
||||||
^^^^^^^^^^^
|
|
||||||
|
|
||||||
Launch Python, and then at the prompt, type::
|
|
||||||
|
|
||||||
>>> import Tkinter
|
|
||||||
|
|
||||||
Or in Python 3::
|
|
||||||
|
|
||||||
>>> import tkinter
|
|
||||||
|
|
||||||
If you have Tkinter, Python will not print an error message, and if not, you will see an ``ImportError``.
|
|
||||||
|
|
||||||
For ghostscript
|
|
||||||
^^^^^^^^^^^^^^^
|
|
||||||
|
|
||||||
Run the following to check the ghostscript version.
|
|
||||||
|
|
||||||
For Ubuntu/macOS::
|
|
||||||
|
|
||||||
$ gs -version
|
|
||||||
|
|
||||||
For Windows::
|
|
||||||
|
|
||||||
C:\> gswin64c.exe -version
|
|
||||||
|
|
||||||
Or for Windows 32-bit::
|
|
||||||
|
|
||||||
C:\> gswin32c.exe -version
|
|
||||||
|
|
||||||
If you have ghostscript, you should see the ghostscript version and copyright information.
|
|
||||||
|
|
||||||
Finally, you can use pip to install Camelot::
|
|
||||||
|
|
||||||
$ pip install camelot-py[cv]
|
|
||||||
|
|
||||||
From the source code
|
From the source code
|
||||||
--------------------
|
--------------------
|
||||||
|
|
||||||
After `installing the dependencies`_, you can install from the source by:
|
After :ref:`installing the dependencies <install_deps>`, you can install from the source by:
|
||||||
|
|
||||||
1. Cloning the GitHub repository.
|
1. Cloning the GitHub repository.
|
||||||
::
|
::
|
||||||
|
|
@ -112,5 +45,3 @@ After `installing the dependencies`_, you can install from the source by:
|
||||||
|
|
||||||
$ cd camelot
|
$ cd camelot
|
||||||
$ pip install ".[cv]"
|
$ pip install ".[cv]"
|
||||||
|
|
||||||
.. _installing the dependencies: https://camelot-py.readthedocs.io/en/master/user/install.html#using-pip
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue