Add utf8 header
parent
3600025a22
commit
be2733ebd2
13
README.md
13
README.md
|
|
@ -1,4 +1,4 @@
|
||||||
# Camelot: PDF Table Parsing for Humans
|
# Camelot: PDF Table Extraction for Humans
|
||||||
|
|
||||||
 
|
 
|
||||||
|
|
||||||
|
|
@ -38,7 +38,7 @@
|
||||||
| 2032_2 | 0.17 | 57.8 | 21.7% | 0.3% | 2.7% | 1.2% |
|
| 2032_2 | 0.17 | 57.8 | 21.7% | 0.3% | 2.7% | 1.2% |
|
||||||
| 4171_1 | 0.07 | 173.9 | 58.1% | 1.6% | 2.1% | 0.5% |
|
| 4171_1 | 0.07 | 173.9 | 58.1% | 1.6% | 2.1% | 0.5% |
|
||||||
|
|
||||||
There's a [command-line interface]() too!
|
There's a [command-line interface](http://camelot-py.readthedocs.io/en/master/user/cli.html) too!
|
||||||
|
|
||||||
## Why Camelot?
|
## Why Camelot?
|
||||||
|
|
||||||
|
|
@ -46,13 +46,12 @@ There's a [command-line interface]() too!
|
||||||
- **Metrics**: *Bad* tables can be discarded based on metrics like accuracy and whitespace, without ever having to manually look at each table.
|
- **Metrics**: *Bad* tables can be discarded based on metrics like accuracy and whitespace, without ever having to manually look at each table.
|
||||||
- Each table is a **pandas DataFrame**, which enables seamless integration into [ETL and data analysis workflows](https://gist.github.com/vinayak-mehta/e5949f7c2410a0e12f25d3682dc9e873).
|
- Each table is a **pandas DataFrame**, which enables seamless integration into [ETL and data analysis workflows](https://gist.github.com/vinayak-mehta/e5949f7c2410a0e12f25d3682dc9e873).
|
||||||
- **Export** to multiple formats, including json, excel and html.
|
- **Export** to multiple formats, including json, excel and html.
|
||||||
- Simple and Elegant API, written in **Python**!
|
|
||||||
|
|
||||||
See [comparison with other PDF parsing libraries and tools](https://github.com/socialcopsdev/camelot/wiki/Comparison-with-other-PDF-Table-Parsing-libraries-and-tools).
|
See [comparison with other PDF table extraction libraries and tools](https://github.com/socialcopsdev/camelot/wiki/Comparison-with-other-PDF-Table-Extraction-libraries-and-tools).
|
||||||
|
|
||||||
## Installation
|
## Installation
|
||||||
|
|
||||||
After [installing the dependencies](), you can simply use pip to install Camelot:
|
After [installing the dependencies](http://camelot-py.readthedocs.io/en/master/user/install.html), [tk](https://packages.ubuntu.com/trusty/python-tk) and [ghostscript](https://www.ghostscript.com/), you can simply use pip to install Camelot:
|
||||||
|
|
||||||
<pre>
|
<pre>
|
||||||
$ pip install camelot-py
|
$ pip install camelot-py
|
||||||
|
|
@ -60,7 +59,7 @@ $ pip install camelot-py
|
||||||
|
|
||||||
### Alternatively
|
### Alternatively
|
||||||
|
|
||||||
You can install the dependencies [tk](https://packages.ubuntu.com/trusty/python-tk) and [ghostscript](https://www.ghostscript.com/) using your system's package manager. After that, clone the repo using:
|
After [installing the dependencies](http://camelot-py.readthedocs.io/en/master/user/install.html), clone the repo using:
|
||||||
|
|
||||||
<pre>
|
<pre>
|
||||||
$ git clone https://www.github.com/socialcopsdev/camelot
|
$ git clone https://www.github.com/socialcopsdev/camelot
|
||||||
|
|
@ -77,7 +76,7 @@ Note: Use a [virtualenv](https://virtualenv.pypa.io/en/stable/) if you don't wan
|
||||||
|
|
||||||
## Documentation
|
## Documentation
|
||||||
|
|
||||||
Great documentation is available at [insert link]().
|
Great documentation is available at [camelot-py.readthedocs.io](http://camelot-py.readthedocs.io/).
|
||||||
|
|
||||||
## Development
|
## Development
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,3 +1,5 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
from .__version__ import __version__
|
from .__version__ import __version__
|
||||||
|
|
||||||
from .io import read_pdf
|
from .io import read_pdf
|
||||||
|
|
@ -1,3 +1,11 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
VERSION = (0, 1, 0)
|
VERSION = (0, 1, 0)
|
||||||
|
|
||||||
|
__title__ = 'camelot-py'
|
||||||
|
__description__ = 'PDF Table Extraction for Humans.'
|
||||||
|
__url__ = 'http://camelot-py.readthedocs.io/'
|
||||||
__version__ = '.'.join(map(str, VERSION))
|
__version__ = '.'.join(map(str, VERSION))
|
||||||
|
__author__ = 'Vinayak Mehta'
|
||||||
|
__author_email__ = 'vmehta94@gmail.com'
|
||||||
|
__license__ = 'MIT License'
|
||||||
|
|
@ -1,4 +1,5 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
from pprint import pprint
|
from pprint import pprint
|
||||||
|
|
||||||
import click
|
import click
|
||||||
|
|
@ -20,23 +21,22 @@ pass_config = click.make_pass_decorator(Config)
|
||||||
|
|
||||||
@click.group()
|
@click.group()
|
||||||
@click.version_option(version=__version__)
|
@click.version_option(version=__version__)
|
||||||
@click.option('-p', '--pages', default='1', help='Comma-separated page numbers'
|
@click.option('-p', '--pages', default='1', help='Comma-separated page numbers.'
|
||||||
' to parse. Example: 1,3,4 or 1,4-end')
|
' Example: 1,3,4 or 1,4-end.')
|
||||||
@click.option('-o', '--output', help='Output filepath.')
|
@click.option('-o', '--output', help='Output file path.')
|
||||||
@click.option('-f', '--format',
|
@click.option('-f', '--format',
|
||||||
type=click.Choice(['csv', 'json', 'excel', 'html']),
|
type=click.Choice(['csv', 'json', 'excel', 'html']),
|
||||||
help='Output file format.')
|
help='Output file format.')
|
||||||
@click.option('-z', '--zip', is_flag=True, help='Whether or not to create a ZIP'
|
@click.option('-z', '--zip', is_flag=True, help='Create ZIP archive.')
|
||||||
' archive.')
|
@click.option('-split', '--split_text', is_flag=True,
|
||||||
@click.option('-split', '--split_text', is_flag=True, help='Whether or not to'
|
help='Split text that spans across multiple cells.')
|
||||||
' split text if it spans across multiple cells.')
|
@click.option('-flag', '--flag_size', is_flag=True, help='Flag text based on'
|
||||||
@click.option('-flag', '--flag_size', is_flag=True, help='(inactive) Whether or'
|
' font size. Useful to detect super/subscripts.')
|
||||||
' not to flag text which has uncommon size. (Useful to detect'
|
|
||||||
' super/subscripts)')
|
|
||||||
@click.option('-M', '--margins', nargs=3, default=(1.0, 0.5, 0.1),
|
@click.option('-M', '--margins', nargs=3, default=(1.0, 0.5, 0.1),
|
||||||
help='char_margin, line_margin, word_margin for PDFMiner.')
|
help='PDFMiner char_margin, line_margin and word_margin.')
|
||||||
@click.pass_context
|
@click.pass_context
|
||||||
def cli(ctx, *args, **kwargs):
|
def cli(ctx, *args, **kwargs):
|
||||||
|
"""Camelot: PDF Table Extraction for Humans"""
|
||||||
ctx.obj = Config()
|
ctx.obj = Config()
|
||||||
for key, value in kwargs.iteritems():
|
for key, value in kwargs.iteritems():
|
||||||
ctx.obj.set_config(key, value)
|
ctx.obj.set_config(key, value)
|
||||||
|
|
@ -44,45 +44,42 @@ def cli(ctx, *args, **kwargs):
|
||||||
|
|
||||||
@cli.command('lattice')
|
@cli.command('lattice')
|
||||||
@click.option('-T', '--table_area', default=[], multiple=True,
|
@click.option('-T', '--table_area', default=[], multiple=True,
|
||||||
help='Table areas (x1,y1,x2,y2) to process.\n'
|
help='Table areas to process. Example: x1,y1,x2,y2'
|
||||||
' x1, y1 -> left-top and x2, y2 -> right-bottom')
|
' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
|
||||||
@click.option('-back', '--process_background', is_flag=True,
|
@click.option('-back', '--process_background', is_flag=True,
|
||||||
help='Whether or not to process lines that are in'
|
help='Process background lines.')
|
||||||
' background.')
|
|
||||||
@click.option('-scale', '--line_size_scaling', default=15,
|
@click.option('-scale', '--line_size_scaling', default=15,
|
||||||
help='Factor by which the page dimensions will be'
|
help='Line size scaling factor. The larger the value,'
|
||||||
' divided to get smallest length of detected lines.')
|
' the smaller the detected lines.')
|
||||||
@click.option('-copy', '--copy_text', default=[], type=click.Choice(['h', 'v']),
|
@click.option('-copy', '--copy_text', default=[], type=click.Choice(['h', 'v']),
|
||||||
multiple=True, help='Specify direction'
|
multiple=True, help='Direction in which text in a spanning cell'
|
||||||
' in which text will be copied over in a spanning cell.')
|
' will be copied over.')
|
||||||
@click.option('-shift', '--shift_text', default=['l', 't'],
|
@click.option('-shift', '--shift_text', default=['l', 't'],
|
||||||
type=click.Choice(['', 'l', 'r', 't', 'b']), multiple=True,
|
type=click.Choice(['', 'l', 'r', 't', 'b']), multiple=True,
|
||||||
help='Specify direction in which text in a spanning'
|
help='Direction in which text in a spanning cell will flow.')
|
||||||
' cell should flow.')
|
|
||||||
@click.option('-l', '--line_close_tol', default=2,
|
@click.option('-l', '--line_close_tol', default=2,
|
||||||
help='Tolerance parameter used to merge close vertical'
|
help='Tolerance parameter used to merge close vertical'
|
||||||
' lines and close horizontal lines.')
|
' and horizontal lines.')
|
||||||
@click.option('-j', '--joint_close_tol', default=2,
|
@click.option('-j', '--joint_close_tol', default=2,
|
||||||
help='Tolerance parameter used to decide whether'
|
help='Tolerance parameter used to decide whether'
|
||||||
' the detected lines and points lie close to each other.')
|
' the detected lines and points lie close to each other.')
|
||||||
@click.option('-block', '--threshold_blocksize', default=15,
|
@click.option('-block', '--threshold_blocksize', default=15,
|
||||||
help='For adaptive thresholding, size of a pixel'
|
help='For adaptive thresholding, size of a pixel'
|
||||||
' neighborhood that is used to calculate a threshold value for'
|
' neighborhood that is used to calculate a threshold value for'
|
||||||
' the pixel: 3, 5, 7, and so on.')
|
' the pixel. Example: 3, 5, 7, and so on.')
|
||||||
@click.option('-const', '--threshold_constant', default=-2,
|
@click.option('-const', '--threshold_constant', default=-2,
|
||||||
help='For adaptive thresholding, constant subtracted'
|
help='For adaptive thresholding, constant subtracted'
|
||||||
' from the mean or weighted mean.\nNormally, it is positive but'
|
' from the mean or weighted mean. Normally, it is positive but'
|
||||||
' may be zero or negative as well.')
|
' may be zero or negative as well.')
|
||||||
@click.option('-I', '--iterations', default=0,
|
@click.option('-I', '--iterations', default=0,
|
||||||
help='Number of times for erosion/dilation is'
|
help='Number of times for erosion/dilation will be applied.')
|
||||||
' applied.')
|
|
||||||
@click.option('-plot', '--plot_type',
|
@click.option('-plot', '--plot_type',
|
||||||
type=click.Choice(['text', 'table', 'contour', 'joint', 'line']),
|
type=click.Choice(['text', 'table', 'contour', 'joint', 'line']),
|
||||||
help='Plot geometry found on PDF page for debugging.')
|
help='Plot geometry found on PDF page, for debugging.')
|
||||||
@click.argument('filepath', type=click.Path(exists=True))
|
@click.argument('filepath', type=click.Path(exists=True))
|
||||||
@pass_config
|
@pass_config
|
||||||
def lattice(c, *args, **kwargs):
|
def lattice(c, *args, **kwargs):
|
||||||
"""Use lines between text to parse table."""
|
"""Use lines between text to parse the table."""
|
||||||
conf = c.config
|
conf = c.config
|
||||||
pages = conf.pop('pages')
|
pages = conf.pop('pages')
|
||||||
output = conf.pop('output')
|
output = conf.pop('output')
|
||||||
|
|
@ -105,29 +102,29 @@ def lattice(c, *args, **kwargs):
|
||||||
table.plot(plot_type)
|
table.plot(plot_type)
|
||||||
else:
|
else:
|
||||||
if output is None:
|
if output is None:
|
||||||
raise click.UsageError('Please specify output filepath using --output')
|
raise click.UsageError('Please specify output file path using --output')
|
||||||
if f is None:
|
if f is None:
|
||||||
raise click.UsageError('Please specify output format using --format')
|
raise click.UsageError('Please specify output file format using --format')
|
||||||
tables.export(output, f=f, compress=compress)
|
tables.export(output, f=f, compress=compress)
|
||||||
|
|
||||||
|
|
||||||
@cli.command('stream')
|
@cli.command('stream')
|
||||||
@click.option('-T', '--table_area', default=[], multiple=True,
|
@click.option('-T', '--table_area', default=[], multiple=True,
|
||||||
help='Table areas (x1,y1,x2,y2) to process.\n'
|
help='Table areas to process. Example: x1,y1,x2,y2'
|
||||||
' x1, y1 -> left-top and x2, y2 -> right-bottom')
|
' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
|
||||||
@click.option('-C', '--columns', default=[], multiple=True,
|
@click.option('-C', '--columns', default=[], multiple=True,
|
||||||
help='x-coordinates of column separators.')
|
help='X coordinates of column separators.')
|
||||||
@click.option('-r', '--row_close_tol', default=2, help='Rows will be'
|
@click.option('-r', '--row_close_tol', default=2, help='Tolerance parameter'
|
||||||
' formed by combining text vertically within this tolerance.')
|
' used to combine text vertically, to generate rows.')
|
||||||
@click.option('-c', '--col_close_tol', default=0, help='Columns will'
|
@click.option('-c', '--col_close_tol', default=0, help='Tolerance parameter'
|
||||||
' be formed by combining text horizontally within this tolerance.')
|
' used to combine text horizontally, to generate columns.')
|
||||||
@click.option('-plot', '--plot_type',
|
@click.option('-plot', '--plot_type',
|
||||||
type=click.Choice(['text', 'table']),
|
type=click.Choice(['text', 'table']),
|
||||||
help='Plot geometry found on PDF page for debugging.')
|
help='Plot geometry found on PDF page for debugging.')
|
||||||
@click.argument('filepath', type=click.Path(exists=True))
|
@click.argument('filepath', type=click.Path(exists=True))
|
||||||
@pass_config
|
@pass_config
|
||||||
def stream(c, *args, **kwargs):
|
def stream(c, *args, **kwargs):
|
||||||
"""Use spaces between text to parse table."""
|
"""Use spaces between text to parse the table."""
|
||||||
conf = c.config
|
conf = c.config
|
||||||
pages = conf.pop('pages')
|
pages = conf.pop('pages')
|
||||||
output = conf.pop('output')
|
output = conf.pop('output')
|
||||||
|
|
@ -149,7 +146,7 @@ def stream(c, *args, **kwargs):
|
||||||
table.plot(plot_type)
|
table.plot(plot_type)
|
||||||
else:
|
else:
|
||||||
if output is None:
|
if output is None:
|
||||||
raise click.UsageError('Please specify output filepath using --output')
|
raise click.UsageError('Please specify output file path using --output')
|
||||||
if f is None:
|
if f is None:
|
||||||
raise click.UsageError('Please specify output format using --format')
|
raise click.UsageError('Please specify output file format using --format')
|
||||||
tables.export(output, f=f, compress=compress)
|
tables.export(output, f=f, compress=compress)
|
||||||
|
|
@ -1,3 +1,5 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import json
|
import json
|
||||||
import zipfile
|
import zipfile
|
||||||
|
|
@ -11,7 +13,7 @@ from .plotting import *
|
||||||
|
|
||||||
class Cell(object):
|
class Cell(object):
|
||||||
"""Defines a cell in a table with coordinates relative to a
|
"""Defines a cell in a table with coordinates relative to a
|
||||||
left-bottom origin. (pdf coordinate space)
|
left-bottom origin. (PDF coordinate space)
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
|
|
@ -89,7 +91,7 @@ class Cell(object):
|
||||||
|
|
||||||
class Table(object):
|
class Table(object):
|
||||||
"""Defines a table with coordinates relative to a left-bottom
|
"""Defines a table with coordinates relative to a left-bottom
|
||||||
origin. (pdf coordinate space)
|
origin. (PDF coordinate space)
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
|
|
@ -110,9 +112,9 @@ class Table(object):
|
||||||
whitespace : float
|
whitespace : float
|
||||||
Percentage of whitespace in the table.
|
Percentage of whitespace in the table.
|
||||||
order : int
|
order : int
|
||||||
Table number on pdf page.
|
Table number on PDF page.
|
||||||
page : int
|
page : int
|
||||||
Pdf page number.
|
PDF page number.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
def __init__(self, cols, rows):
|
def __init__(self, cols, rows):
|
||||||
|
|
|
||||||
|
|
@ -1,3 +1,5 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from PyPDF2 import PdfFileReader, PdfFileWriter
|
from PyPDF2 import PdfFileReader, PdfFileWriter
|
||||||
|
|
@ -10,16 +12,16 @@ from .utils import (TemporaryDirectory, get_page_layout, get_text_objects,
|
||||||
|
|
||||||
class PDFHandler(object):
|
class PDFHandler(object):
|
||||||
"""Handles all operations like temp directory creation, splitting
|
"""Handles all operations like temp directory creation, splitting
|
||||||
file into single page pdfs, parsing each pdf and then removing the
|
file into single page PDFs, parsing each PDF and then removing the
|
||||||
temp directory.
|
temp directory.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
filename : str
|
filename : str
|
||||||
Path to pdf file.
|
Path to PDF file.
|
||||||
pages : str, optional (default: '1')
|
pages : str, optional (default: '1')
|
||||||
Comma-separated page numbers to parse.
|
Comma-separated page numbers.
|
||||||
Example: 1,3,4 or 1,4-end
|
Example: 1,3,4 or 1,4-end.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
def __init__(self, filename, pages='1'):
|
def __init__(self, filename, pages='1'):
|
||||||
|
|
@ -34,10 +36,10 @@ class PDFHandler(object):
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
filename : str
|
filename : str
|
||||||
Path to pdf file.
|
Path to PDF file.
|
||||||
pages : str, optional (default: '1')
|
pages : str, optional (default: '1')
|
||||||
Comma-separated page numbers to parse.
|
Comma-separated page numbers.
|
||||||
Example: 1,3,4 or 1,4-end
|
Example: 1,3,4 or 1,4-end.
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
|
|
@ -67,16 +69,16 @@ class PDFHandler(object):
|
||||||
return sorted(set(P))
|
return sorted(set(P))
|
||||||
|
|
||||||
def _save_page(self, filename, page, temp):
|
def _save_page(self, filename, page, temp):
|
||||||
"""Saves specified page from pdf into a temporary directory.
|
"""Saves specified page from PDF into a temporary directory.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
filename : str
|
filename : str
|
||||||
Path to pdf file.
|
Path to PDF file.
|
||||||
page : int
|
page : int
|
||||||
Page number
|
Page number.
|
||||||
temp : str
|
temp : str
|
||||||
Tmp directory
|
Tmp directory.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
with open(filename, 'rb') as fileobj:
|
with open(filename, 'rb') as fileobj:
|
||||||
|
|
@ -91,7 +93,7 @@ class PDFHandler(object):
|
||||||
with open(fpath, 'wb') as f:
|
with open(fpath, 'wb') as f:
|
||||||
outfile.write(f)
|
outfile.write(f)
|
||||||
layout, dim = get_page_layout(fpath)
|
layout, dim = get_page_layout(fpath)
|
||||||
# fix rotated pdf
|
# fix rotated PDF
|
||||||
lttextlh = get_text_objects(layout, ltype="lh")
|
lttextlh = get_text_objects(layout, ltype="lh")
|
||||||
lttextlv = get_text_objects(layout, ltype="lv")
|
lttextlv = get_text_objects(layout, ltype="lv")
|
||||||
ltchar = get_text_objects(layout, ltype="char")
|
ltchar = get_text_objects(layout, ltype="char")
|
||||||
|
|
@ -114,7 +116,7 @@ class PDFHandler(object):
|
||||||
|
|
||||||
def parse(self, flavor='lattice', **kwargs):
|
def parse(self, flavor='lattice', **kwargs):
|
||||||
"""Extracts tables by calling parser.get_tables on all single
|
"""Extracts tables by calling parser.get_tables on all single
|
||||||
page pdfs.
|
page PDFs.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
|
|
@ -127,10 +129,10 @@ class PDFHandler(object):
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
tables : camelot.core.TableList
|
tables : camelot.core.TableList
|
||||||
List of tables found in pdf.
|
List of tables found in PDF.
|
||||||
geometry : camelot.core.GeometryList
|
geometry : camelot.core.GeometryList
|
||||||
List of geometry objects (contours, lines, joints)
|
List of geometry objects (contours, lines, joints) found
|
||||||
found in pdf.
|
in PDF.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
tables = []
|
tables = []
|
||||||
|
|
|
||||||
|
|
@ -1,3 +1,5 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
from __future__ import division
|
from __future__ import division
|
||||||
from itertools import groupby
|
from itertools import groupby
|
||||||
from operator import itemgetter
|
from operator import itemgetter
|
||||||
|
|
|
||||||
|
|
@ -1,9 +1,11 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
from .handlers import PDFHandler
|
from .handlers import PDFHandler
|
||||||
from .utils import validate_input, remove_extra
|
from .utils import validate_input, remove_extra
|
||||||
|
|
||||||
|
|
||||||
def read_pdf(filepath, pages='1', flavor='lattice', **kwargs):
|
def read_pdf(filepath, pages='1', flavor='lattice', **kwargs):
|
||||||
"""Read PDF and return parsed data tables.
|
"""Read PDF and return extracted tables.
|
||||||
|
|
||||||
Note: kwargs annotated with ^ can only be used with flavor='stream'
|
Note: kwargs annotated with ^ can only be used with flavor='stream'
|
||||||
and kwargs annotated with * can only be used with flavor='lattice'.
|
and kwargs annotated with * can only be used with flavor='lattice'.
|
||||||
|
|
@ -11,53 +13,47 @@ def read_pdf(filepath, pages='1', flavor='lattice', **kwargs):
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
filepath : str
|
filepath : str
|
||||||
Path to pdf file.
|
Path to PDF file.
|
||||||
pages : str, optional (default: '1')
|
pages : str, optional (default: '1')
|
||||||
Comma-separated page numbers to parse.
|
Comma-separated page numbers.
|
||||||
Example: 1,3,4 or 1,4-end
|
Example: 1,3,4 or 1,4-end.
|
||||||
flavor : str (default: 'lattice')
|
flavor : str (default: 'lattice')
|
||||||
The parsing method to use ('lattice' or 'stream').
|
The parsing method to use ('lattice' or 'stream').
|
||||||
Lattice is used by default.
|
Lattice is used by default.
|
||||||
table_area : list, optional (default: None)
|
table_area : list, optional (default: None)
|
||||||
List of table areas to process as strings of the form
|
List of table area strings of the form x1,y1,x2,y2
|
||||||
x1,y1,x2,y2 where (x1, y1) -> left-top and
|
where (x1, y1) -> left-top and (x2, y2) -> right-bottom
|
||||||
(x2, y2) -> right-bottom in pdf coordinate space.
|
in PDF coordinate space.
|
||||||
columns^ : list, optional (default: None)
|
columns^ : list, optional (default: None)
|
||||||
List of column x-coordinates as strings where the coordinates
|
List of column x-coordinate strings where the coordinates
|
||||||
are comma-separated.
|
are comma-separated.
|
||||||
split_text : bool, optional (default: False)
|
split_text : bool, optional (default: False)
|
||||||
Whether or not to split a text line if it spans across
|
Split text that spans across multiple cells.
|
||||||
multiple cells.
|
|
||||||
flag_size : bool, optional (default: False)
|
flag_size : bool, optional (default: False)
|
||||||
Whether or not to highlight a substring using <s></s>
|
Flag text based on font size. Useful to detect
|
||||||
if its size is different from rest of the string. (Useful for
|
super/subscripts. Adds <s></s> around flagged text.
|
||||||
super and subscripts)
|
|
||||||
row_close_tol^ : int, optional (default: 2)
|
row_close_tol^ : int, optional (default: 2)
|
||||||
Rows will be formed by combining text vertically
|
Tolerance parameter used to combine text vertically,
|
||||||
within this tolerance.
|
to generate rows.
|
||||||
col_close_tol^ : int, optional (default: 0)
|
col_close_tol^ : int, optional (default: 0)
|
||||||
Columns will be formed by combining text horizontally
|
Tolerance parameter used to combine text horizontally,
|
||||||
within this tolerance.
|
to generate columns.
|
||||||
process_background* : bool, optional (default: False)
|
process_background* : bool, optional (default: False)
|
||||||
Whether or not to process lines that are in background.
|
Process background lines.
|
||||||
line_size_scaling* : int, optional (default: 15)
|
line_size_scaling* : int, optional (default: 15)
|
||||||
Factor by which the page dimensions will be divided to get
|
Line size scaling factor. The larger the value the smaller
|
||||||
smallest length of lines that should be detected.
|
the detected lines. Making it very large will lead to text
|
||||||
|
being detected as lines.
|
||||||
The larger this value, smaller the detected lines. Making it
|
|
||||||
too large will lead to text being detected as lines.
|
|
||||||
copy_text* : list, optional (default: None)
|
copy_text* : list, optional (default: None)
|
||||||
{'h', 'v'}
|
{'h', 'v'}
|
||||||
Select one or more strings from above and pass them as a list
|
Direction in which text in a spanning cell will be copied
|
||||||
to specify the direction in which text should be copied over
|
over.
|
||||||
when a cell spans multiple rows or columns.
|
|
||||||
shift_text* : list, optional (default: ['l', 't'])
|
shift_text* : list, optional (default: ['l', 't'])
|
||||||
{'l', 'r', 't', 'b'}
|
{'l', 'r', 't', 'b'}
|
||||||
Select one or more strings from above and pass them as a list
|
Direction in which text in a spanning cell will flow.
|
||||||
to specify where the text in a spanning cell should flow.
|
|
||||||
line_close_tol* : int, optional (default: 2)
|
line_close_tol* : int, optional (default: 2)
|
||||||
Tolerance parameter used to merge vertical and horizontal
|
Tolerance parameter used to merge close vertical and horizontal
|
||||||
detected lines which lie close to each other.
|
lines.
|
||||||
joint_close_tol* : int, optional (default: 2)
|
joint_close_tol* : int, optional (default: 2)
|
||||||
Tolerance parameter used to decide whether the detected lines
|
Tolerance parameter used to decide whether the detected lines
|
||||||
and points lie close to each other.
|
and points lie close to each other.
|
||||||
|
|
@ -76,7 +72,7 @@ def read_pdf(filepath, pages='1', flavor='lattice', **kwargs):
|
||||||
|
|
||||||
For more information, refer to `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
|
For more information, refer to `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
|
||||||
margins : tuple
|
margins : tuple
|
||||||
PDFMiner margins. (char_margin, line_margin, word_margin)
|
PDFMiner char_margin, line_margin and word_margin.
|
||||||
|
|
||||||
For more information, refer to `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
|
For more information, refer to `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,2 +1,4 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
from .stream import Stream
|
from .stream import Stream
|
||||||
from .lattice import Lattice
|
from .lattice import Lattice
|
||||||
|
|
@ -1,3 +1,5 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from ..utils import get_page_layout, get_text_objects
|
from ..utils import get_page_layout, get_text_objects
|
||||||
|
|
|
||||||
|
|
@ -1,3 +1,5 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
from __future__ import division
|
from __future__ import division
|
||||||
import os
|
import os
|
||||||
import copy
|
import copy
|
||||||
|
|
@ -21,41 +23,35 @@ logger = setup_logging(__name__)
|
||||||
|
|
||||||
class Lattice(BaseParser):
|
class Lattice(BaseParser):
|
||||||
"""Lattice method of parsing looks for lines between text
|
"""Lattice method of parsing looks for lines between text
|
||||||
to parse table.
|
to parse the table.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
table_area : list, optional (default: None)
|
table_area : list, optional (default: None)
|
||||||
List of table areas to analyze as strings of the form
|
List of table area strings of the form x1,y1,x2,y2
|
||||||
x1,y1,x2,y2 where (x1, y1) -> left-top and
|
where (x1, y1) -> left-top and (x2, y2) -> right-bottom
|
||||||
(x2, y2) -> right-bottom in pdf coordinate space.
|
in PDF coordinate space.
|
||||||
process_background : bool, optional (default: False)
|
process_background : bool, optional (default: False)
|
||||||
Whether or not to process lines that are in background.
|
Process background lines.
|
||||||
line_size_scaling : int, optional (default: 15)
|
line_size_scaling : int, optional (default: 15)
|
||||||
Factor by which the page dimensions will be divided to get
|
Line size scaling factor. The larger the value the smaller
|
||||||
smallest length of lines that should be detected.
|
the detected lines. Making it very large will lead to text
|
||||||
|
being detected as lines.
|
||||||
The larger this value, smaller the detected lines. Making it
|
|
||||||
too large will lead to text being detected as lines.
|
|
||||||
copy_text : list, optional (default: None)
|
copy_text : list, optional (default: None)
|
||||||
{'h', 'v'}
|
{'h', 'v'}
|
||||||
Select one or more strings from above and pass them as a list
|
Direction in which text in a spanning cell will be copied
|
||||||
to specify the direction in which text should be copied over
|
over.
|
||||||
when a cell spans multiple rows or columns.
|
|
||||||
shift_text : list, optional (default: ['l', 't'])
|
shift_text : list, optional (default: ['l', 't'])
|
||||||
{'l', 'r', 't', 'b'}
|
{'l', 'r', 't', 'b'}
|
||||||
Select one or more strings from above and pass them as a list
|
Direction in which text in a spanning cell will flow.
|
||||||
to specify where the text in a spanning cell should flow.
|
|
||||||
split_text : bool, optional (default: False)
|
split_text : bool, optional (default: False)
|
||||||
Whether or not to split a text line if it spans across
|
Split text that spans across multiple cells.
|
||||||
multiple cells.
|
|
||||||
flag_size : bool, optional (default: False)
|
flag_size : bool, optional (default: False)
|
||||||
Whether or not to highlight a substring using <s></s>
|
Flag text based on font size. Useful to detect
|
||||||
if its size is different from rest of the string. (Useful for
|
super/subscripts. Adds <s></s> around flagged text.
|
||||||
super and subscripts)
|
|
||||||
line_close_tol : int, optional (default: 2)
|
line_close_tol : int, optional (default: 2)
|
||||||
Tolerance parameter used to merge vertical and horizontal
|
Tolerance parameter used to merge close vertical and horizontal
|
||||||
detected lines which lie close to each other.
|
lines.
|
||||||
joint_close_tol : int, optional (default: 2)
|
joint_close_tol : int, optional (default: 2)
|
||||||
Tolerance parameter used to decide whether the detected lines
|
Tolerance parameter used to decide whether the detected lines
|
||||||
and points lie close to each other.
|
and points lie close to each other.
|
||||||
|
|
@ -74,7 +70,7 @@ class Lattice(BaseParser):
|
||||||
|
|
||||||
For more information, refer to `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
|
For more information, refer to `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
|
||||||
margins : tuple
|
margins : tuple
|
||||||
PDFMiner margins. (char_margin, line_margin, word_margin)
|
PDFMiner char_margin, line_margin and word_margin.
|
||||||
|
|
||||||
For more information, refer to `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
|
For more information, refer to `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,3 +1,5 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
from __future__ import division
|
from __future__ import division
|
||||||
import os
|
import os
|
||||||
import logging
|
import logging
|
||||||
|
|
@ -16,7 +18,7 @@ logger = setup_logging(__name__)
|
||||||
|
|
||||||
class Stream(BaseParser):
|
class Stream(BaseParser):
|
||||||
"""Stream method of parsing looks for spaces between text
|
"""Stream method of parsing looks for spaces between text
|
||||||
to parse table.
|
to parse the table.
|
||||||
|
|
||||||
If you want to specify columns when specifying multiple table
|
If you want to specify columns when specifying multiple table
|
||||||
areas, make sure that the lengths of both lists are equal.
|
areas, make sure that the lengths of both lists are equal.
|
||||||
|
|
@ -24,27 +26,25 @@ class Stream(BaseParser):
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
table_area : list, optional (default: None)
|
table_area : list, optional (default: None)
|
||||||
List of table areas to analyze as strings of the form
|
List of table area strings of the form x1,y1,x2,y2
|
||||||
x1,y1,x2,y2 where (x1, y1) -> left-top and
|
where (x1, y1) -> left-top and (x2, y2) -> right-bottom
|
||||||
(x2, y2) -> right-bottom in pdf coordinate space.
|
in PDF coordinate space.
|
||||||
columns : list, optional (default: None)
|
columns : list, optional (default: None)
|
||||||
List of column x-coordinates as strings where the coordinates
|
List of column x-coordinate strings where the coordinates
|
||||||
are comma-separated.
|
are comma-separated.
|
||||||
split_text : bool, optional (default: False)
|
split_text : bool, optional (default: False)
|
||||||
Whether or not to split a text line if it spans across
|
Split text that spans across multiple cells.
|
||||||
multiple cells.
|
|
||||||
flag_size : bool, optional (default: False)
|
flag_size : bool, optional (default: False)
|
||||||
Whether or not to highlight a substring using <s></s>
|
Flag text based on font size. Useful to detect
|
||||||
if its size is different from rest of the string. (Useful for
|
super/subscripts. Adds <s></s> around flagged text.
|
||||||
super and subscripts)
|
|
||||||
row_close_tol : int, optional (default: 2)
|
row_close_tol : int, optional (default: 2)
|
||||||
Rows will be formed by combining text vertically
|
Tolerance parameter used to combine text vertically,
|
||||||
within this tolerance.
|
to generate rows.
|
||||||
col_close_tol : int, optional (default: 0)
|
col_close_tol : int, optional (default: 0)
|
||||||
Columns will be formed by combining text horizontally
|
Tolerance parameter used to combine text horizontally,
|
||||||
within this tolerance.
|
to generate columns.
|
||||||
margins : tuple, optional (default: (1.0, 0.5, 0.1))
|
margins : tuple, optional (default: (1.0, 0.5, 0.1))
|
||||||
PDFMiner margins. (char_margin, line_margin, word_margin)
|
PDFMiner char_margin, line_margin and word_margin.
|
||||||
|
|
||||||
For more information, refer to `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
|
For more information, refer to `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
|
||||||
|
|
||||||
|
|
|
||||||
30
setup.py
30
setup.py
|
|
@ -1,3 +1,5 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
import os
|
import os
|
||||||
from setuptools import find_packages
|
from setuptools import find_packages
|
||||||
from pkg_resources import parse_version
|
from pkg_resources import parse_version
|
||||||
|
|
@ -8,16 +10,8 @@ about = {}
|
||||||
with open(os.path.join(here, 'camelot', '__version__.py'), 'r') as f:
|
with open(os.path.join(here, 'camelot', '__version__.py'), 'r') as f:
|
||||||
exec(f.read(), about)
|
exec(f.read(), about)
|
||||||
|
|
||||||
# TODO: Move these to __version__.py
|
with open('README.md', 'r') as f:
|
||||||
NAME = 'camelot-py'
|
readme = f.read()
|
||||||
VERSION = about['__version__']
|
|
||||||
DESCRIPTION = 'PDF Table Parsing for Humans'
|
|
||||||
with open('README.md') as f:
|
|
||||||
LONG_DESCRIPTION = f.read()
|
|
||||||
URL = 'https://github.com/socialcopsdev/camelot'
|
|
||||||
AUTHOR = 'Vinayak Mehta'
|
|
||||||
AUTHOR_EMAIL = 'vmehta94@gmail.com'
|
|
||||||
LICENSE = 'MIT License'
|
|
||||||
|
|
||||||
|
|
||||||
def setup_package():
|
def setup_package():
|
||||||
|
|
@ -31,14 +25,14 @@ def setup_package():
|
||||||
for line in f:
|
for line in f:
|
||||||
dev_reqs.append(line.strip())
|
dev_reqs.append(line.strip())
|
||||||
|
|
||||||
metadata = dict(name=NAME,
|
metadata = dict(name=about['__title__'],
|
||||||
version=VERSION,
|
version=about['__version__'],
|
||||||
description=DESCRIPTION,
|
description=about['__description__'],
|
||||||
long_description=LONG_DESCRIPTION,
|
long_description=readme,
|
||||||
url=URL,
|
url=about['__url__'],
|
||||||
author=AUTHOR,
|
author=about['__author__'],
|
||||||
author_email=AUTHOR_EMAIL,
|
author_email=about['__author_email__'],
|
||||||
license=LICENSE,
|
license=about['__license__'],
|
||||||
packages=find_packages(exclude=('tests',)),
|
packages=find_packages(exclude=('tests',)),
|
||||||
install_requires=reqs,
|
install_requires=reqs,
|
||||||
extras_require={
|
extras_require={
|
||||||
|
|
|
||||||
|
|
@ -1,3 +1,5 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue