Add utf8 header

pull/2/head
Vinayak Mehta 2018-09-24 16:27:26 +05:30
parent 3600025a22
commit be2733ebd2
14 changed files with 158 additions and 154 deletions

View File

@ -1,4 +1,4 @@
# Camelot: PDF Table Parsing for Humans
# Camelot: PDF Table Extraction for Humans
![license](https://img.shields.io/badge/license-MIT-lightgrey.svg) ![python-version](https://img.shields.io/badge/python-2.7-blue.svg)
@ -38,7 +38,7 @@
| 2032_2 | 0.17 | 57.8 | 21.7% | 0.3% | 2.7% | 1.2% |
| 4171_1 | 0.07 | 173.9 | 58.1% | 1.6% | 2.1% | 0.5% |
There's a [command-line interface]() too!
There's a [command-line interface](http://camelot-py.readthedocs.io/en/master/user/cli.html) too!
## Why Camelot?
@ -46,13 +46,12 @@ There's a [command-line interface]() too!
- **Metrics**: *Bad* tables can be discarded based on metrics like accuracy and whitespace, without ever having to manually look at each table.
- Each table is a **pandas DataFrame**, which enables seamless integration into [ETL and data analysis workflows](https://gist.github.com/vinayak-mehta/e5949f7c2410a0e12f25d3682dc9e873).
- **Export** to multiple formats, including json, excel and html.
- Simple and Elegant API, written in **Python**!
See [comparison with other PDF parsing libraries and tools](https://github.com/socialcopsdev/camelot/wiki/Comparison-with-other-PDF-Table-Parsing-libraries-and-tools).
See [comparison with other PDF table extraction libraries and tools](https://github.com/socialcopsdev/camelot/wiki/Comparison-with-other-PDF-Table-Extraction-libraries-and-tools).
## Installation
After [installing the dependencies](), you can simply use pip to install Camelot:
After [installing the dependencies](http://camelot-py.readthedocs.io/en/master/user/install.html) ([tk](https://packages.ubuntu.com/trusty/python-tk) and [ghostscript](https://www.ghostscript.com/)), you can simply use pip to install Camelot:
<pre>
$ pip install camelot-py
@ -60,7 +59,7 @@ $ pip install camelot-py
### Alternatively
You can install the dependencies [tk](https://packages.ubuntu.com/trusty/python-tk) and [ghostscript](https://www.ghostscript.com/) using your system's package manager. After that, clone the repo using:
After [installing the dependencies](http://camelot-py.readthedocs.io/en/master/user/install.html), clone the repo using:
<pre>
$ git clone https://www.github.com/socialcopsdev/camelot
@ -77,7 +76,7 @@ Note: Use a [virtualenv](https://virtualenv.pypa.io/en/stable/) if you don't wan
## Documentation
Great documentation is available at [insert link]().
Great documentation is available at [Read the Docs](http://camelot-py.readthedocs.io/).
## Development

View File

@ -1,3 +1,5 @@
# -*- coding: utf-8 -*-
from .__version__ import __version__
from .io import read_pdf

View File

@ -1,3 +1,11 @@
# -*- coding: utf-8 -*-
# Release number as a (major, minor, patch) tuple.
VERSION = (0, 1, 0)

# Package metadata.
__title__ = 'camelot-py'
__description__ = 'PDF Table Extraction for Humans.'
__url__ = 'http://camelot-py.readthedocs.io/'
# Dotted version string derived from VERSION, e.g. (0, 1, 0) -> '0.1.0'.
__version__ = '.'.join(str(part) for part in VERSION)
__author__ = 'Vinayak Mehta'
__author_email__ = 'vmehta94@gmail.com'
__license__ = 'MIT License'

View File

@ -1,4 +1,5 @@
# -*- coding: utf-8 -*-
from pprint import pprint
import click
@ -20,23 +21,22 @@ pass_config = click.make_pass_decorator(Config)
@click.group()
@click.version_option(version=__version__)
@click.option('-p', '--pages', default='1', help='Comma-separated page numbers'
' to parse. Example: 1,3,4 or 1,4-end')
@click.option('-p', '--pages', default='1', help='Comma-separated page numbers.'
' Example: 1,3,4 or 1,4-end.')
@click.option('-o', '--output', help='Output file path.')
@click.option('-f', '--format',
type=click.Choice(['csv', 'json', 'excel', 'html']),
help='Output file format.')
@click.option('-z', '--zip', is_flag=True, help='Whether or not to create a ZIP'
' archive.')
@click.option('-split', '--split_text', is_flag=True, help='Whether or not to'
' split text if it spans across multiple cells.')
@click.option('-flag', '--flag_size', is_flag=True, help='(inactive) Whether or'
' not to flag text which has uncommon size. (Useful to detect'
' super/subscripts)')
@click.option('-z', '--zip', is_flag=True, help='Create ZIP archive.')
@click.option('-split', '--split_text', is_flag=True,
help='Split text that spans across multiple cells.')
@click.option('-flag', '--flag_size', is_flag=True, help='Flag text based on'
' font size. Useful to detect super/subscripts.')
@click.option('-M', '--margins', nargs=3, default=(1.0, 0.5, 0.1),
help='char_margin, line_margin, word_margin for PDFMiner.')
help='PDFMiner char_margin, line_margin and word_margin.')
@click.pass_context
def cli(ctx, *args, **kwargs):
"""Camelot: PDF Table Extraction for Humans"""
ctx.obj = Config()
for key, value in kwargs.iteritems():
ctx.obj.set_config(key, value)
@ -44,45 +44,42 @@ def cli(ctx, *args, **kwargs):
@cli.command('lattice')
@click.option('-T', '--table_area', default=[], multiple=True,
help='Table areas (x1,y1,x2,y2) to process.\n'
' x1, y1 -> left-top and x2, y2 -> right-bottom')
help='Table areas to process. Example: x1,y1,x2,y2'
' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
@click.option('-back', '--process_background', is_flag=True,
help='Whether or not to process lines that are in'
' background.')
help='Process background lines.')
@click.option('-scale', '--line_size_scaling', default=15,
help='Factor by which the page dimensions will be'
' divided to get smallest length of detected lines.')
help='Line size scaling factor. The larger the value,'
' the smaller the detected lines.')
@click.option('-copy', '--copy_text', default=[], type=click.Choice(['h', 'v']),
multiple=True, help='Specify direction'
' in which text will be copied over in a spanning cell.')
multiple=True, help='Direction in which text in a spanning cell'
' will be copied over.')
@click.option('-shift', '--shift_text', default=['l', 't'],
type=click.Choice(['', 'l', 'r', 't', 'b']), multiple=True,
help='Specify direction in which text in a spanning'
' cell should flow.')
help='Direction in which text in a spanning cell will flow.')
@click.option('-l', '--line_close_tol', default=2,
help='Tolerance parameter used to merge close vertical'
' lines and close horizontal lines.')
' and horizontal lines.')
@click.option('-j', '--joint_close_tol', default=2,
help='Tolerance parameter used to decide whether'
' the detected lines and points lie close to each other.')
@click.option('-block', '--threshold_blocksize', default=15,
help='For adaptive thresholding, size of a pixel'
' neighborhood that is used to calculate a threshold value for'
' the pixel: 3, 5, 7, and so on.')
' the pixel. Example: 3, 5, 7, and so on.')
@click.option('-const', '--threshold_constant', default=-2,
help='For adaptive thresholding, constant subtracted'
' from the mean or weighted mean.\nNormally, it is positive but'
' from the mean or weighted mean. Normally, it is positive but'
' may be zero or negative as well.')
@click.option('-I', '--iterations', default=0,
help='Number of times for erosion/dilation is'
' applied.')
help='Number of times for erosion/dilation will be applied.')
@click.option('-plot', '--plot_type',
type=click.Choice(['text', 'table', 'contour', 'joint', 'line']),
help='Plot geometry found on PDF page for debugging.')
help='Plot geometry found on PDF page, for debugging.')
@click.argument('filepath', type=click.Path(exists=True))
@pass_config
def lattice(c, *args, **kwargs):
"""Use lines between text to parse table."""
"""Use lines between text to parse the table."""
conf = c.config
pages = conf.pop('pages')
output = conf.pop('output')
@ -107,27 +104,27 @@ def lattice(c, *args, **kwargs):
if output is None:
raise click.UsageError('Please specify output file path using --output')
if f is None:
raise click.UsageError('Please specify output format using --format')
raise click.UsageError('Please specify output file format using --format')
tables.export(output, f=f, compress=compress)
@cli.command('stream')
@click.option('-T', '--table_area', default=[], multiple=True,
help='Table areas (x1,y1,x2,y2) to process.\n'
' x1, y1 -> left-top and x2, y2 -> right-bottom')
help='Table areas to process. Example: x1,y1,x2,y2'
' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
@click.option('-C', '--columns', default=[], multiple=True,
help='x-coordinates of column separators.')
@click.option('-r', '--row_close_tol', default=2, help='Rows will be'
' formed by combining text vertically within this tolerance.')
@click.option('-c', '--col_close_tol', default=0, help='Columns will'
' be formed by combining text horizontally within this tolerance.')
help='X coordinates of column separators.')
@click.option('-r', '--row_close_tol', default=2, help='Tolerance parameter'
' used to combine text vertically, to generate rows.')
@click.option('-c', '--col_close_tol', default=0, help='Tolerance parameter'
' used to combine text horizontally, to generate columns.')
@click.option('-plot', '--plot_type',
type=click.Choice(['text', 'table']),
help='Plot geometry found on PDF page for debugging.')
@click.argument('filepath', type=click.Path(exists=True))
@pass_config
def stream(c, *args, **kwargs):
"""Use spaces between text to parse table."""
"""Use spaces between text to parse the table."""
conf = c.config
pages = conf.pop('pages')
output = conf.pop('output')
@ -151,5 +148,5 @@ def stream(c, *args, **kwargs):
if output is None:
raise click.UsageError('Please specify output file path using --output')
if f is None:
raise click.UsageError('Please specify output format using --format')
raise click.UsageError('Please specify output file format using --format')
tables.export(output, f=f, compress=compress)

View File

@ -1,3 +1,5 @@
# -*- coding: utf-8 -*-
import os
import json
import zipfile
@ -11,7 +13,7 @@ from .plotting import *
class Cell(object):
"""Defines a cell in a table with coordinates relative to a
left-bottom origin. (pdf coordinate space)
left-bottom origin. (PDF coordinate space)
Parameters
----------
@ -89,7 +91,7 @@ class Cell(object):
class Table(object):
"""Defines a table with coordinates relative to a left-bottom
origin. (pdf coordinate space)
origin. (PDF coordinate space)
Parameters
----------
@ -110,9 +112,9 @@ class Table(object):
whitespace : float
Percentage of whitespace in the table.
order : int
Table number on pdf page.
Table number on PDF page.
page : int
Pdf page number.
PDF page number.
"""
def __init__(self, cols, rows):

View File

@ -1,3 +1,5 @@
# -*- coding: utf-8 -*-
import os
from PyPDF2 import PdfFileReader, PdfFileWriter
@ -10,16 +12,16 @@ from .utils import (TemporaryDirectory, get_page_layout, get_text_objects,
class PDFHandler(object):
"""Handles all operations like temp directory creation, splitting
file into single page pdfs, parsing each pdf and then removing the
file into single page PDFs, parsing each PDF and then removing the
temp directory.
Parameters
----------
filename : str
Path to pdf file.
Path to PDF file.
pages : str, optional (default: '1')
Comma-separated page numbers to parse.
Example: 1,3,4 or 1,4-end
Comma-separated page numbers.
Example: 1,3,4 or 1,4-end.
"""
def __init__(self, filename, pages='1'):
@ -34,10 +36,10 @@ class PDFHandler(object):
Parameters
----------
filename : str
Path to pdf file.
Path to PDF file.
pages : str, optional (default: '1')
Comma-separated page numbers to parse.
Example: 1,3,4 or 1,4-end
Comma-separated page numbers.
Example: 1,3,4 or 1,4-end.
Returns
-------
@ -67,16 +69,16 @@ class PDFHandler(object):
return sorted(set(P))
def _save_page(self, filename, page, temp):
"""Saves specified page from pdf into a temporary directory.
"""Saves specified page from PDF into a temporary directory.
Parameters
----------
filename : str
Path to pdf file.
Path to PDF file.
page : int
Page number
Page number.
temp : str
Tmp directory
Tmp directory.
"""
with open(filename, 'rb') as fileobj:
@ -91,7 +93,7 @@ class PDFHandler(object):
with open(fpath, 'wb') as f:
outfile.write(f)
layout, dim = get_page_layout(fpath)
# fix rotated pdf
# fix rotated PDF
lttextlh = get_text_objects(layout, ltype="lh")
lttextlv = get_text_objects(layout, ltype="lv")
ltchar = get_text_objects(layout, ltype="char")
@ -114,7 +116,7 @@ class PDFHandler(object):
def parse(self, flavor='lattice', **kwargs):
"""Extracts tables by calling parser.get_tables on all single
page pdfs.
page PDFs.
Parameters
----------
@ -127,10 +129,10 @@ class PDFHandler(object):
Returns
-------
tables : camelot.core.TableList
List of tables found in pdf.
List of tables found in PDF.
geometry : camelot.core.GeometryList
List of geometry objects (contours, lines, joints)
found in pdf.
List of geometry objects (contours, lines, joints) found
in PDF.
"""
tables = []

View File

@ -1,3 +1,5 @@
# -*- coding: utf-8 -*-
from __future__ import division
from itertools import groupby
from operator import itemgetter

View File

@ -1,9 +1,11 @@
# -*- coding: utf-8 -*-
from .handlers import PDFHandler
from .utils import validate_input, remove_extra
def read_pdf(filepath, pages='1', flavor='lattice', **kwargs):
"""Read PDF and return parsed data tables.
"""Read PDF and return extracted tables.
Note: kwargs annotated with ^ can only be used with flavor='stream'
and kwargs annotated with * can only be used with flavor='lattice'.
@ -11,53 +13,47 @@ def read_pdf(filepath, pages='1', flavor='lattice', **kwargs):
Parameters
----------
filepath : str
Path to pdf file.
Path to PDF file.
pages : str, optional (default: '1')
Comma-separated page numbers to parse.
Example: 1,3,4 or 1,4-end
Comma-separated page numbers.
Example: 1,3,4 or 1,4-end.
flavor : str (default: 'lattice')
The parsing method to use ('lattice' or 'stream').
Lattice is used by default.
table_area : list, optional (default: None)
List of table areas to process as strings of the form
x1,y1,x2,y2 where (x1, y1) -> left-top and
(x2, y2) -> right-bottom in pdf coordinate space.
List of table area strings of the form x1,y1,x2,y2
where (x1, y1) -> left-top and (x2, y2) -> right-bottom
in PDF coordinate space.
columns^ : list, optional (default: None)
List of column x-coordinates as strings where the coordinates
List of column x-coordinate strings, where the coordinates
are comma-separated.
split_text : bool, optional (default: False)
Whether or not to split a text line if it spans across
multiple cells.
Split text that spans across multiple cells.
flag_size : bool, optional (default: False)
Whether or not to highlight a substring using <s></s>
if its size is different from rest of the string. (Useful for
super and subscripts)
Flag text based on font size. Useful to detect
super/subscripts. Adds <s></s> around flagged text.
row_close_tol^ : int, optional (default: 2)
Rows will be formed by combining text vertically
within this tolerance.
Tolerance parameter used to combine text vertically,
to generate rows.
col_close_tol^ : int, optional (default: 0)
Columns will be formed by combining text horizontally
within this tolerance.
Tolerance parameter used to combine text horizontally,
to generate columns.
process_background* : bool, optional (default: False)
Whether or not to process lines that are in background.
Process background lines.
line_size_scaling* : int, optional (default: 15)
Factor by which the page dimensions will be divided to get
smallest length of lines that should be detected.
The larger this value, smaller the detected lines. Making it
too large will lead to text being detected as lines.
Line size scaling factor. The larger the value the smaller
the detected lines. Making it very large will lead to text
being detected as lines.
copy_text* : list, optional (default: None)
{'h', 'v'}
Select one or more strings from above and pass them as a list
to specify the direction in which text should be copied over
when a cell spans multiple rows or columns.
Direction in which text in a spanning cell will be copied
over.
shift_text* : list, optional (default: ['l', 't'])
{'l', 'r', 't', 'b'}
Select one or more strings from above and pass them as a list
to specify where the text in a spanning cell should flow.
Direction in which text in a spanning cell will flow.
line_close_tol* : int, optional (default: 2)
Tolerance parameter used to merge vertical and horizontal
detected lines which lie close to each other.
Tolerance parameter used to merge close vertical and horizontal
lines.
joint_close_tol* : int, optional (default: 2)
Tolerance parameter used to decide whether the detected lines
and points lie close to each other.
@ -76,7 +72,7 @@ def read_pdf(filepath, pages='1', flavor='lattice', **kwargs):
For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
margins : tuple
PDFMiner margins. (char_margin, line_margin, word_margin)
PDFMiner char_margin, line_margin and word_margin.
For more information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.

View File

@ -1,2 +1,4 @@
# -*- coding: utf-8 -*-
from .stream import Stream
from .lattice import Lattice

View File

@ -1,3 +1,5 @@
# -*- coding: utf-8 -*-
import os
from ..utils import get_page_layout, get_text_objects

View File

@ -1,3 +1,5 @@
# -*- coding: utf-8 -*-
from __future__ import division
import os
import copy
@ -21,41 +23,35 @@ logger = setup_logging(__name__)
class Lattice(BaseParser):
"""Lattice method of parsing looks for lines between text
to parse table.
to parse the table.
Parameters
----------
table_area : list, optional (default: None)
List of table areas to analyze as strings of the form
x1,y1,x2,y2 where (x1, y1) -> left-top and
(x2, y2) -> right-bottom in pdf coordinate space.
List of table area strings of the form x1,y1,x2,y2
where (x1, y1) -> left-top and (x2, y2) -> right-bottom
in PDF coordinate space.
process_background : bool, optional (default: False)
Whether or not to process lines that are in background.
Process background lines.
line_size_scaling : int, optional (default: 15)
Factor by which the page dimensions will be divided to get
smallest length of lines that should be detected.
The larger this value, smaller the detected lines. Making it
too large will lead to text being detected as lines.
Line size scaling factor. The larger the value the smaller
the detected lines. Making it very large will lead to text
being detected as lines.
copy_text : list, optional (default: None)
{'h', 'v'}
Select one or more strings from above and pass them as a list
to specify the direction in which text should be copied over
when a cell spans multiple rows or columns.
Direction in which text in a spanning cell will be copied
over.
shift_text : list, optional (default: ['l', 't'])
{'l', 'r', 't', 'b'}
Select one or more strings from above and pass them as a list
to specify where the text in a spanning cell should flow.
Direction in which text in a spanning cell will flow.
split_text : bool, optional (default: False)
Whether or not to split a text line if it spans across
multiple cells.
Split text that spans across multiple cells.
flag_size : bool, optional (default: False)
Whether or not to highlight a substring using <s></s>
if its size is different from rest of the string. (Useful for
super and subscripts)
Flag text based on font size. Useful to detect
super/subscripts. Adds <s></s> around flagged text.
line_close_tol : int, optional (default: 2)
Tolerance parameter used to merge vertical and horizontal
detected lines which lie close to each other.
Tolerance parameter used to merge close vertical and horizontal
lines.
joint_close_tol : int, optional (default: 2)
Tolerance parameter used to decide whether the detected lines
and points lie close to each other.
@ -74,7 +70,7 @@ class Lattice(BaseParser):
For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
margins : tuple
PDFMiner margins. (char_margin, line_margin, word_margin)
PDFMiner char_margin, line_margin and word_margin.
For more information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.

View File

@ -1,3 +1,5 @@
# -*- coding: utf-8 -*-
from __future__ import division
import os
import logging
@ -16,7 +18,7 @@ logger = setup_logging(__name__)
class Stream(BaseParser):
"""Stream method of parsing looks for spaces between text
to parse table.
to parse the table.
If you want to specify columns when specifying multiple table
areas, make sure that the length of both lists are equal.
@ -24,27 +26,25 @@ class Stream(BaseParser):
Parameters
----------
table_area : list, optional (default: None)
List of table areas to analyze as strings of the form
x1,y1,x2,y2 where (x1, y1) -> left-top and
(x2, y2) -> right-bottom in pdf coordinate space.
List of table area strings of the form x1,y1,x2,y2
where (x1, y1) -> left-top and (x2, y2) -> right-bottom
in PDF coordinate space.
columns : list, optional (default: None)
List of column x-coordinates as strings where the coordinates
List of column x-coordinate strings, where the coordinates
are comma-separated.
split_text : bool, optional (default: False)
Whether or not to split a text line if it spans across
multiple cells.
Split text that spans across multiple cells.
flag_size : bool, optional (default: False)
Whether or not to highlight a substring using <s></s>
if its size is different from rest of the string. (Useful for
super and subscripts)
Flag text based on font size. Useful to detect
super/subscripts. Adds <s></s> around flagged text.
row_close_tol : int, optional (default: 2)
Rows will be formed by combining text vertically
within this tolerance.
Tolerance parameter used to combine text vertically,
to generate rows.
col_close_tol : int, optional (default: 0)
Columns will be formed by combining text horizontally
within this tolerance.
Tolerance parameter used to combine text horizontally,
to generate columns.
margins : tuple, optional (default: (1.0, 0.5, 0.1))
PDFMiner margins. (char_margin, line_margin, word_margin)
PDFMiner char_margin, line_margin and word_margin.
For more information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.

View File

@ -1,3 +1,5 @@
# -*- coding: utf-8 -*-
import os
from setuptools import find_packages
from pkg_resources import parse_version
@ -8,16 +10,8 @@ about = {}
with open(os.path.join(here, 'camelot', '__version__.py'), 'r') as f:
exec(f.read(), about)
# TODO: Move these to __version__.py
NAME = 'camelot-py'
VERSION = about['__version__']
DESCRIPTION = 'PDF Table Parsing for Humans'
with open('README.md') as f:
LONG_DESCRIPTION = f.read()
URL = 'https://github.com/socialcopsdev/camelot'
AUTHOR = 'Vinayak Mehta'
AUTHOR_EMAIL = 'vmehta94@gmail.com'
LICENSE = 'MIT License'
with open('README.md', 'r') as f:
readme = f.read()
def setup_package():
@ -31,14 +25,14 @@ def setup_package():
for line in f:
dev_reqs.append(line.strip())
metadata = dict(name=NAME,
version=VERSION,
description=DESCRIPTION,
long_description=LONG_DESCRIPTION,
url=URL,
author=AUTHOR,
author_email=AUTHOR_EMAIL,
license=LICENSE,
metadata = dict(name=about['__title__'],
version=about['__version__'],
description=about['__description__'],
long_description=readme,
url=about['__url__'],
author=about['__author__'],
author_email=about['__author_email__'],
license=about['__license__'],
packages=find_packages(exclude=('tests',)),
install_requires=reqs,
extras_require={

View File

@ -1,3 +1,5 @@
# -*- coding: utf-8 -*-
import os
import pandas as pd