Add utf8 header

pull/2/head
Vinayak Mehta 2018-09-24 16:27:26 +05:30
parent 3600025a22
commit be2733ebd2
14 changed files with 158 additions and 154 deletions

View File

@ -1,4 +1,4 @@
# Camelot: PDF Table Parsing for Humans # Camelot: PDF Table Extraction for Humans
![license](https://img.shields.io/badge/license-MIT-lightgrey.svg) ![python-version](https://img.shields.io/badge/python-2.7-blue.svg) ![license](https://img.shields.io/badge/license-MIT-lightgrey.svg) ![python-version](https://img.shields.io/badge/python-2.7-blue.svg)
@ -38,7 +38,7 @@
| 2032_2 | 0.17 | 57.8 | 21.7% | 0.3% | 2.7% | 1.2% | | 2032_2 | 0.17 | 57.8 | 21.7% | 0.3% | 2.7% | 1.2% |
| 4171_1 | 0.07 | 173.9 | 58.1% | 1.6% | 2.1% | 0.5% | | 4171_1 | 0.07 | 173.9 | 58.1% | 1.6% | 2.1% | 0.5% |
There's a [command-line interface]() too! There's a [command-line interface](http://camelot-py.readthedocs.io/en/master/user/cli.html) too!
## Why Camelot? ## Why Camelot?
@ -46,13 +46,12 @@ There's a [command-line interface]() too!
- **Metrics**: *Bad* tables can be discarded based on metrics like accuracy and whitespace, without ever having to manually look at each table. - **Metrics**: *Bad* tables can be discarded based on metrics like accuracy and whitespace, without ever having to manually look at each table.
- Each table is a **pandas DataFrame**, which enables seamless integration into [ETL and data analysis workflows](https://gist.github.com/vinayak-mehta/e5949f7c2410a0e12f25d3682dc9e873). - Each table is a **pandas DataFrame**, which enables seamless integration into [ETL and data analysis workflows](https://gist.github.com/vinayak-mehta/e5949f7c2410a0e12f25d3682dc9e873).
- **Export** to multiple formats, including json, excel and html. - **Export** to multiple formats, including json, excel and html.
- Simple and Elegant API, written in **Python**!
See [comparison with other PDF parsing libraries and tools](https://github.com/socialcopsdev/camelot/wiki/Comparison-with-other-PDF-Table-Parsing-libraries-and-tools). See [comparison with other PDF table extraction libraries and tools](https://github.com/socialcopsdev/camelot/wiki/Comparison-with-other-PDF-Table-Extraction-libraries-and-tools).
## Installation ## Installation
After [installing the dependencies](), you can simply use pip to install Camelot: After [installing the dependencies](http://camelot-py.readthedocs.io/en/master/user/install.html) ([tk](https://packages.ubuntu.com/trusty/python-tk) and [ghostscript](https://www.ghostscript.com/)), you can simply use pip to install Camelot:
<pre> <pre>
$ pip install camelot-py $ pip install camelot-py
@ -60,7 +59,7 @@ $ pip install camelot-py
### Alternatively ### Alternatively
You can install the dependencies [tk](https://packages.ubuntu.com/trusty/python-tk) and [ghostscript](https://www.ghostscript.com/) using your system's package manager. After that, clone the repo using: After [installing the dependencies](http://camelot-py.readthedocs.io/en/master/user/install.html), clone the repo using:
<pre> <pre>
$ git clone https://www.github.com/socialcopsdev/camelot $ git clone https://www.github.com/socialcopsdev/camelot
@ -77,7 +76,7 @@ Note: Use a [virtualenv](https://virtualenv.pypa.io/en/stable/) if you don't wan
## Documentation ## Documentation
Great documentation is available at [insert link](). Great documentation is available at [Read the Docs](http://camelot-py.readthedocs.io/).
## Development ## Development

View File

@ -1,3 +1,5 @@
# -*- coding: utf-8 -*-
from .__version__ import __version__ from .__version__ import __version__
from .io import read_pdf from .io import read_pdf

View File

@ -1,3 +1,11 @@
# -*- coding: utf-8 -*-
VERSION = (0, 1, 0) VERSION = (0, 1, 0)
__title__ = 'camelot-py'
__description__ = 'PDF Table Extraction for Humans.'
__url__ = 'http://camelot-py.readthedocs.io/'
__version__ = '.'.join(map(str, VERSION)) __version__ = '.'.join(map(str, VERSION))
__author__ = 'Vinayak Mehta'
__author_email__ = 'vmehta94@gmail.com'
__license__ = 'MIT License'

View File

@ -1,4 +1,5 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from pprint import pprint from pprint import pprint
import click import click
@ -20,23 +21,22 @@ pass_config = click.make_pass_decorator(Config)
@click.group() @click.group()
@click.version_option(version=__version__) @click.version_option(version=__version__)
@click.option('-p', '--pages', default='1', help='Comma-separated page numbers' @click.option('-p', '--pages', default='1', help='Comma-separated page numbers.'
' to parse. Example: 1,3,4 or 1,4-end') ' Example: 1,3,4 or 1,4-end.')
@click.option('-o', '--output', help='Output file path.') @click.option('-o', '--output', help='Output file path.')
@click.option('-f', '--format', @click.option('-f', '--format',
type=click.Choice(['csv', 'json', 'excel', 'html']), type=click.Choice(['csv', 'json', 'excel', 'html']),
help='Output file format.') help='Output file format.')
@click.option('-z', '--zip', is_flag=True, help='Whether or not to create a ZIP' @click.option('-z', '--zip', is_flag=True, help='Create ZIP archive.')
' archive.') @click.option('-split', '--split_text', is_flag=True,
@click.option('-split', '--split_text', is_flag=True, help='Whether or not to' help='Split text that spans across multiple cells.')
' split text if it spans across multiple cells.') @click.option('-flag', '--flag_size', is_flag=True, help='Flag text based on'
@click.option('-flag', '--flag_size', is_flag=True, help='(inactive) Whether or' ' font size. Useful to detect super/subscripts.')
' not to flag text which has uncommon size. (Useful to detect'
' super/subscripts)')
@click.option('-M', '--margins', nargs=3, default=(1.0, 0.5, 0.1), @click.option('-M', '--margins', nargs=3, default=(1.0, 0.5, 0.1),
help='char_margin, line_margin, word_margin for PDFMiner.') help='PDFMiner char_margin, line_margin and word_margin.')
@click.pass_context @click.pass_context
def cli(ctx, *args, **kwargs): def cli(ctx, *args, **kwargs):
"""Camelot: PDF Table Extraction for Humans"""
ctx.obj = Config() ctx.obj = Config()
for key, value in kwargs.iteritems(): for key, value in kwargs.iteritems():
ctx.obj.set_config(key, value) ctx.obj.set_config(key, value)
@ -44,45 +44,42 @@ def cli(ctx, *args, **kwargs):
@cli.command('lattice') @cli.command('lattice')
@click.option('-T', '--table_area', default=[], multiple=True, @click.option('-T', '--table_area', default=[], multiple=True,
help='Table areas (x1,y1,x2,y2) to process.\n' help='Table areas to process. Example: x1,y1,x2,y2'
' x1, y1 -> left-top and x2, y2 -> right-bottom') ' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
@click.option('-back', '--process_background', is_flag=True, @click.option('-back', '--process_background', is_flag=True,
help='Whether or not to process lines that are in' help='Process background lines.')
' background.')
@click.option('-scale', '--line_size_scaling', default=15, @click.option('-scale', '--line_size_scaling', default=15,
help='Factor by which the page dimensions will be' help='Line size scaling factor. The larger the value,'
' divided to get smallest length of detected lines.') ' the smaller the detected lines.')
@click.option('-copy', '--copy_text', default=[], type=click.Choice(['h', 'v']), @click.option('-copy', '--copy_text', default=[], type=click.Choice(['h', 'v']),
multiple=True, help='Specify direction' multiple=True, help='Direction in which text in a spanning cell'
' in which text will be copied over in a spanning cell.') ' will be copied over.')
@click.option('-shift', '--shift_text', default=['l', 't'], @click.option('-shift', '--shift_text', default=['l', 't'],
type=click.Choice(['', 'l', 'r', 't', 'b']), multiple=True, type=click.Choice(['', 'l', 'r', 't', 'b']), multiple=True,
help='Specify direction in which text in a spanning' help='Direction in which text in a spanning cell will flow.')
' cell should flow.')
@click.option('-l', '--line_close_tol', default=2, @click.option('-l', '--line_close_tol', default=2,
help='Tolerance parameter used to merge close vertical' help='Tolerance parameter used to merge close vertical'
' lines and close horizontal lines.') ' and horizontal lines.')
@click.option('-j', '--joint_close_tol', default=2, @click.option('-j', '--joint_close_tol', default=2,
help='Tolerance parameter used to decide whether' help='Tolerance parameter used to decide whether'
' the detected lines and points lie close to each other.') ' the detected lines and points lie close to each other.')
@click.option('-block', '--threshold_blocksize', default=15, @click.option('-block', '--threshold_blocksize', default=15,
help='For adaptive thresholding, size of a pixel' help='For adaptive thresholding, size of a pixel'
' neighborhood that is used to calculate a threshold value for' ' neighborhood that is used to calculate a threshold value for'
' the pixel: 3, 5, 7, and so on.') ' the pixel. Example: 3, 5, 7, and so on.')
@click.option('-const', '--threshold_constant', default=-2, @click.option('-const', '--threshold_constant', default=-2,
help='For adaptive thresholding, constant subtracted' help='For adaptive thresholding, constant subtracted'
' from the mean or weighted mean.\nNormally, it is positive but' ' from the mean or weighted mean. Normally, it is positive but'
' may be zero or negative as well.') ' may be zero or negative as well.')
@click.option('-I', '--iterations', default=0, @click.option('-I', '--iterations', default=0,
help='Number of times for erosion/dilation is' help='Number of times for erosion/dilation will be applied.')
' applied.')
@click.option('-plot', '--plot_type', @click.option('-plot', '--plot_type',
type=click.Choice(['text', 'table', 'contour', 'joint', 'line']), type=click.Choice(['text', 'table', 'contour', 'joint', 'line']),
help='Plot geometry found on PDF page for debugging.') help='Plot geometry found on PDF page, for debugging.')
@click.argument('filepath', type=click.Path(exists=True)) @click.argument('filepath', type=click.Path(exists=True))
@pass_config @pass_config
def lattice(c, *args, **kwargs): def lattice(c, *args, **kwargs):
"""Use lines between text to parse table.""" """Use lines between text to parse the table."""
conf = c.config conf = c.config
pages = conf.pop('pages') pages = conf.pop('pages')
output = conf.pop('output') output = conf.pop('output')
@ -107,27 +104,27 @@ def lattice(c, *args, **kwargs):
if output is None: if output is None:
raise click.UsageError('Please specify output file path using --output') raise click.UsageError('Please specify output file path using --output')
if f is None: if f is None:
raise click.UsageError('Please specify output format using --format') raise click.UsageError('Please specify output file format using --format')
tables.export(output, f=f, compress=compress) tables.export(output, f=f, compress=compress)
@cli.command('stream') @cli.command('stream')
@click.option('-T', '--table_area', default=[], multiple=True, @click.option('-T', '--table_area', default=[], multiple=True,
help='Table areas (x1,y1,x2,y2) to process.\n' help='Table areas to process. Example: x1,y1,x2,y2'
' x1, y1 -> left-top and x2, y2 -> right-bottom') ' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
@click.option('-C', '--columns', default=[], multiple=True, @click.option('-C', '--columns', default=[], multiple=True,
help='x-coordinates of column separators.') help='X coordinates of column separators.')
@click.option('-r', '--row_close_tol', default=2, help='Rows will be' @click.option('-r', '--row_close_tol', default=2, help='Tolerance parameter'
' formed by combining text vertically within this tolerance.') ' used to combine text vertically, to generate rows.')
@click.option('-c', '--col_close_tol', default=0, help='Columns will' @click.option('-c', '--col_close_tol', default=0, help='Tolerance parameter'
' be formed by combining text horizontally within this tolerance.') ' used to combine text horizontally, to generate columns.')
@click.option('-plot', '--plot_type', @click.option('-plot', '--plot_type',
type=click.Choice(['text', 'table']), type=click.Choice(['text', 'table']),
help='Plot geometry found on PDF page for debugging.') help='Plot geometry found on PDF page for debugging.')
@click.argument('filepath', type=click.Path(exists=True)) @click.argument('filepath', type=click.Path(exists=True))
@pass_config @pass_config
def stream(c, *args, **kwargs): def stream(c, *args, **kwargs):
"""Use spaces between text to parse table.""" """Use spaces between text to parse the table."""
conf = c.config conf = c.config
pages = conf.pop('pages') pages = conf.pop('pages')
output = conf.pop('output') output = conf.pop('output')
@ -151,5 +148,5 @@ def stream(c, *args, **kwargs):
if output is None: if output is None:
raise click.UsageError('Please specify output file path using --output') raise click.UsageError('Please specify output file path using --output')
if f is None: if f is None:
raise click.UsageError('Please specify output format using --format') raise click.UsageError('Please specify output file format using --format')
tables.export(output, f=f, compress=compress) tables.export(output, f=f, compress=compress)

View File

@ -1,3 +1,5 @@
# -*- coding: utf-8 -*-
import os import os
import json import json
import zipfile import zipfile
@ -11,7 +13,7 @@ from .plotting import *
class Cell(object): class Cell(object):
"""Defines a cell in a table with coordinates relative to a """Defines a cell in a table with coordinates relative to a
left-bottom origin. (pdf coordinate space) left-bottom origin. (PDF coordinate space)
Parameters Parameters
---------- ----------
@ -89,7 +91,7 @@ class Cell(object):
class Table(object): class Table(object):
"""Defines a table with coordinates relative to a left-bottom """Defines a table with coordinates relative to a left-bottom
origin. (pdf coordinate space) origin. (PDF coordinate space)
Parameters Parameters
---------- ----------
@ -110,9 +112,9 @@ class Table(object):
whitespace : float whitespace : float
Percentage of whitespace in the table. Percentage of whitespace in the table.
order : int order : int
Table number on pdf page. Table number on PDF page.
page : int page : int
Pdf page number. PDF page number.
""" """
def __init__(self, cols, rows): def __init__(self, cols, rows):

View File

@ -1,3 +1,5 @@
# -*- coding: utf-8 -*-
import os import os
from PyPDF2 import PdfFileReader, PdfFileWriter from PyPDF2 import PdfFileReader, PdfFileWriter
@ -10,16 +12,16 @@ from .utils import (TemporaryDirectory, get_page_layout, get_text_objects,
class PDFHandler(object): class PDFHandler(object):
"""Handles all operations like temp directory creation, splitting """Handles all operations like temp directory creation, splitting
file into single page pdfs, parsing each pdf and then removing the file into single page PDFs, parsing each PDF and then removing the
temp directory. temp directory.
Parameters Parameters
---------- ----------
filename : str filename : str
Path to pdf file. Path to PDF file.
pages : str, optional (default: '1') pages : str, optional (default: '1')
Comma-separated page numbers to parse. Comma-separated page numbers.
Example: 1,3,4 or 1,4-end Example: 1,3,4 or 1,4-end.
""" """
def __init__(self, filename, pages='1'): def __init__(self, filename, pages='1'):
@ -34,10 +36,10 @@ class PDFHandler(object):
Parameters Parameters
---------- ----------
filename : str filename : str
Path to pdf file. Path to PDF file.
pages : str, optional (default: '1') pages : str, optional (default: '1')
Comma-separated page numbers to parse. Comma-separated page numbers.
Example: 1,3,4 or 1,4-end Example: 1,3,4 or 1,4-end.
Returns Returns
------- -------
@ -67,16 +69,16 @@ class PDFHandler(object):
return sorted(set(P)) return sorted(set(P))
def _save_page(self, filename, page, temp): def _save_page(self, filename, page, temp):
"""Saves specified page from pdf into a temporary directory. """Saves specified page from PDF into a temporary directory.
Parameters Parameters
---------- ----------
filename : str filename : str
Path to pdf file. Path to PDF file.
page : int page : int
Page number Page number.
temp : str temp : str
Tmp directory Tmp directory.
""" """
with open(filename, 'rb') as fileobj: with open(filename, 'rb') as fileobj:
@ -91,7 +93,7 @@ class PDFHandler(object):
with open(fpath, 'wb') as f: with open(fpath, 'wb') as f:
outfile.write(f) outfile.write(f)
layout, dim = get_page_layout(fpath) layout, dim = get_page_layout(fpath)
# fix rotated pdf # fix rotated PDF
lttextlh = get_text_objects(layout, ltype="lh") lttextlh = get_text_objects(layout, ltype="lh")
lttextlv = get_text_objects(layout, ltype="lv") lttextlv = get_text_objects(layout, ltype="lv")
ltchar = get_text_objects(layout, ltype="char") ltchar = get_text_objects(layout, ltype="char")
@ -114,7 +116,7 @@ class PDFHandler(object):
def parse(self, flavor='lattice', **kwargs): def parse(self, flavor='lattice', **kwargs):
"""Extracts tables by calling parser.get_tables on all single """Extracts tables by calling parser.get_tables on all single
page pdfs. page PDFs.
Parameters Parameters
---------- ----------
@ -127,10 +129,10 @@ class PDFHandler(object):
Returns Returns
------- -------
tables : camelot.core.TableList tables : camelot.core.TableList
List of tables found in pdf. List of tables found in PDF.
geometry : camelot.core.GeometryList geometry : camelot.core.GeometryList
List of geometry objects (contours, lines, joints) List of geometry objects (contours, lines, joints) found
found in pdf. in PDF.
""" """
tables = [] tables = []

View File

@ -1,3 +1,5 @@
# -*- coding: utf-8 -*-
from __future__ import division from __future__ import division
from itertools import groupby from itertools import groupby
from operator import itemgetter from operator import itemgetter

View File

@ -1,9 +1,11 @@
# -*- coding: utf-8 -*-
from .handlers import PDFHandler from .handlers import PDFHandler
from .utils import validate_input, remove_extra from .utils import validate_input, remove_extra
def read_pdf(filepath, pages='1', flavor='lattice', **kwargs): def read_pdf(filepath, pages='1', flavor='lattice', **kwargs):
"""Read PDF and return parsed data tables. """Read PDF and return extracted tables.
Note: kwargs annotated with ^ can only be used with flavor='stream' Note: kwargs annotated with ^ can only be used with flavor='stream'
and kwargs annotated with * can only be used with flavor='lattice'. and kwargs annotated with * can only be used with flavor='lattice'.
@ -11,53 +13,47 @@ def read_pdf(filepath, pages='1', flavor='lattice', **kwargs):
Parameters Parameters
---------- ----------
filepath : str filepath : str
Path to pdf file. Path to PDF file.
pages : str, optional (default: '1') pages : str, optional (default: '1')
Comma-separated page numbers to parse. Comma-separated page numbers.
Example: 1,3,4 or 1,4-end Example: 1,3,4 or 1,4-end.
flavor : str (default: 'lattice') flavor : str (default: 'lattice')
The parsing method to use ('lattice' or 'stream'). The parsing method to use ('lattice' or 'stream').
Lattice is used by default. Lattice is used by default.
table_area : list, optional (default: None) table_area : list, optional (default: None)
List of table areas to process as strings of the form List of table area strings of the form x1,y1,x2,y2
x1,y1,x2,y2 where (x1, y1) -> left-top and where (x1, y1) -> left-top and (x2, y2) -> right-bottom
(x2, y2) -> right-bottom in pdf coordinate space. in PDF coordinate space.
columns^ : list, optional (default: None) columns^ : list, optional (default: None)
List of column x-coordinates as strings where the coordinates List of column x-coordinate strings where the coordinates
are comma-separated. are comma-separated.
split_text : bool, optional (default: False) split_text : bool, optional (default: False)
Whether or not to split a text line if it spans across Split text that spans across multiple cells.
multiple cells.
flag_size : bool, optional (default: False) flag_size : bool, optional (default: False)
Whether or not to highlight a substring using <s></s> Flag text based on font size. Useful to detect
if its size is different from rest of the string. (Useful for super/subscripts. Adds <s></s> around flagged text.
super and subscripts)
row_close_tol^ : int, optional (default: 2) row_close_tol^ : int, optional (default: 2)
Rows will be formed by combining text vertically Tolerance parameter used to combine text vertically,
within this tolerance. to generate rows.
col_close_tol^ : int, optional (default: 0) col_close_tol^ : int, optional (default: 0)
Columns will be formed by combining text horizontally Tolerance parameter used to combine text horizontally,
within this tolerance. to generate columns.
process_background* : bool, optional (default: False) process_background* : bool, optional (default: False)
Whether or not to process lines that are in background. Process background lines.
line_size_scaling* : int, optional (default: 15) line_size_scaling* : int, optional (default: 15)
Factor by which the page dimensions will be divided to get Line size scaling factor. The larger the value, the smaller
smallest length of lines that should be detected. the detected lines. Making it very large will lead to text
being detected as lines.
The larger this value, smaller the detected lines. Making it
too large will lead to text being detected as lines.
copy_text* : list, optional (default: None) copy_text* : list, optional (default: None)
{'h', 'v'} {'h', 'v'}
Select one or more strings from above and pass them as a list Direction in which text in a spanning cell will be copied
to specify the direction in which text should be copied over over.
when a cell spans multiple rows or columns.
shift_text* : list, optional (default: ['l', 't']) shift_text* : list, optional (default: ['l', 't'])
{'l', 'r', 't', 'b'} {'l', 'r', 't', 'b'}
Select one or more strings from above and pass them as a list Direction in which text in a spanning cell will flow.
to specify where the text in a spanning cell should flow.
line_close_tol* : int, optional (default: 2) line_close_tol* : int, optional (default: 2)
Tolerance parameter used to merge vertical and horizontal Tolerance parameter used to merge close vertical and horizontal
detected lines which lie close to each other. lines.
joint_close_tol* : int, optional (default: 2) joint_close_tol* : int, optional (default: 2)
Tolerance parameter used to decide whether the detected lines Tolerance parameter used to decide whether the detected lines
and points lie close to each other. and points lie close to each other.
@ -76,7 +72,7 @@ def read_pdf(filepath, pages='1', flavor='lattice', **kwargs):
For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_. For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
margins : tuple margins : tuple
PDFMiner margins. (char_margin, line_margin, word_margin) PDFMiner char_margin, line_margin and word_margin.
For more information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_. For more information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.

View File

@ -1,2 +1,4 @@
# -*- coding: utf-8 -*-
from .stream import Stream from .stream import Stream
from .lattice import Lattice from .lattice import Lattice

View File

@ -1,3 +1,5 @@
# -*- coding: utf-8 -*-
import os import os
from ..utils import get_page_layout, get_text_objects from ..utils import get_page_layout, get_text_objects

View File

@ -1,3 +1,5 @@
# -*- coding: utf-8 -*-
from __future__ import division from __future__ import division
import os import os
import copy import copy
@ -21,41 +23,35 @@ logger = setup_logging(__name__)
class Lattice(BaseParser): class Lattice(BaseParser):
"""Lattice method of parsing looks for lines between text """Lattice method of parsing looks for lines between text
to parse table. to parse the table.
Parameters Parameters
---------- ----------
table_area : list, optional (default: None) table_area : list, optional (default: None)
List of table areas to analyze as strings of the form List of table area strings of the form x1,y1,x2,y2
x1,y1,x2,y2 where (x1, y1) -> left-top and where (x1, y1) -> left-top and (x2, y2) -> right-bottom
(x2, y2) -> right-bottom in pdf coordinate space. in PDF coordinate space.
process_background : bool, optional (default: False) process_background : bool, optional (default: False)
Whether or not to process lines that are in background. Process background lines.
line_size_scaling : int, optional (default: 15) line_size_scaling : int, optional (default: 15)
Factor by which the page dimensions will be divided to get Line size scaling factor. The larger the value, the smaller
smallest length of lines that should be detected. the detected lines. Making it very large will lead to text
being detected as lines.
The larger this value, smaller the detected lines. Making it
too large will lead to text being detected as lines.
copy_text : list, optional (default: None) copy_text : list, optional (default: None)
{'h', 'v'} {'h', 'v'}
Select one or more strings from above and pass them as a list Direction in which text in a spanning cell will be copied
to specify the direction in which text should be copied over over.
when a cell spans multiple rows or columns.
shift_text : list, optional (default: ['l', 't']) shift_text : list, optional (default: ['l', 't'])
{'l', 'r', 't', 'b'} {'l', 'r', 't', 'b'}
Select one or more strings from above and pass them as a list Direction in which text in a spanning cell will flow.
to specify where the text in a spanning cell should flow.
split_text : bool, optional (default: False) split_text : bool, optional (default: False)
Whether or not to split a text line if it spans across Split text that spans across multiple cells.
multiple cells.
flag_size : bool, optional (default: False) flag_size : bool, optional (default: False)
Whether or not to highlight a substring using <s></s> Flag text based on font size. Useful to detect
if its size is different from rest of the string. (Useful for super/subscripts. Adds <s></s> around flagged text.
super and subscripts)
line_close_tol : int, optional (default: 2) line_close_tol : int, optional (default: 2)
Tolerance parameter used to merge vertical and horizontal Tolerance parameter used to merge close vertical and horizontal
detected lines which lie close to each other. lines.
joint_close_tol : int, optional (default: 2) joint_close_tol : int, optional (default: 2)
Tolerance parameter used to decide whether the detected lines Tolerance parameter used to decide whether the detected lines
and points lie close to each other. and points lie close to each other.
@ -74,7 +70,7 @@ class Lattice(BaseParser):
For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_. For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
margins : tuple margins : tuple
PDFMiner margins. (char_margin, line_margin, word_margin) PDFMiner char_margin, line_margin and word_margin.
For more information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_. For more information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.

View File

@ -1,3 +1,5 @@
# -*- coding: utf-8 -*-
from __future__ import division from __future__ import division
import os import os
import logging import logging
@ -16,7 +18,7 @@ logger = setup_logging(__name__)
class Stream(BaseParser): class Stream(BaseParser):
"""Stream method of parsing looks for spaces between text """Stream method of parsing looks for spaces between text
to parse table. to parse the table.
If you want to specify columns when specifying multiple table If you want to specify columns when specifying multiple table
areas, make sure that the lengths of both lists are equal. areas, make sure that the lengths of both lists are equal.
@ -24,27 +26,25 @@ class Stream(BaseParser):
Parameters Parameters
---------- ----------
table_area : list, optional (default: None) table_area : list, optional (default: None)
List of table areas to analyze as strings of the form List of table area strings of the form x1,y1,x2,y2
x1,y1,x2,y2 where (x1, y1) -> left-top and where (x1, y1) -> left-top and (x2, y2) -> right-bottom
(x2, y2) -> right-bottom in pdf coordinate space. in PDF coordinate space.
columns : list, optional (default: None) columns : list, optional (default: None)
List of column x-coordinates as strings where the coordinates List of column x-coordinate strings where the coordinates
are comma-separated. are comma-separated.
split_text : bool, optional (default: False) split_text : bool, optional (default: False)
Whether or not to split a text line if it spans across Split text that spans across multiple cells.
multiple cells.
flag_size : bool, optional (default: False) flag_size : bool, optional (default: False)
Whether or not to highlight a substring using <s></s> Flag text based on font size. Useful to detect
if its size is different from rest of the string. (Useful for super/subscripts. Adds <s></s> around flagged text.
super and subscripts)
row_close_tol : int, optional (default: 2) row_close_tol : int, optional (default: 2)
Rows will be formed by combining text vertically Tolerance parameter used to combine text vertically,
within this tolerance. to generate rows.
col_close_tol : int, optional (default: 0) col_close_tol : int, optional (default: 0)
Columns will be formed by combining text horizontally Tolerance parameter used to combine text horizontally,
within this tolerance. to generate columns.
margins : tuple, optional (default: (1.0, 0.5, 0.1)) margins : tuple, optional (default: (1.0, 0.5, 0.1))
PDFMiner margins. (char_margin, line_margin, word_margin) PDFMiner char_margin, line_margin and word_margin.
For more information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_. For more information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.

View File

@ -1,3 +1,5 @@
# -*- coding: utf-8 -*-
import os import os
from setuptools import find_packages from setuptools import find_packages
from pkg_resources import parse_version from pkg_resources import parse_version
@ -8,16 +10,8 @@ about = {}
with open(os.path.join(here, 'camelot', '__version__.py'), 'r') as f: with open(os.path.join(here, 'camelot', '__version__.py'), 'r') as f:
exec(f.read(), about) exec(f.read(), about)
# TODO: Move these to __version__.py with open('README.md', 'r') as f:
NAME = 'camelot-py' readme = f.read()
VERSION = about['__version__']
DESCRIPTION = 'PDF Table Parsing for Humans'
with open('README.md') as f:
LONG_DESCRIPTION = f.read()
URL = 'https://github.com/socialcopsdev/camelot'
AUTHOR = 'Vinayak Mehta'
AUTHOR_EMAIL = 'vmehta94@gmail.com'
LICENSE = 'MIT License'
def setup_package(): def setup_package():
@ -31,14 +25,14 @@ def setup_package():
for line in f: for line in f:
dev_reqs.append(line.strip()) dev_reqs.append(line.strip())
metadata = dict(name=NAME, metadata = dict(name=about['__title__'],
version=VERSION, version=about['__version__'],
description=DESCRIPTION, description=about['__description__'],
long_description=LONG_DESCRIPTION, long_description=readme,
url=URL, url=about['__url__'],
author=AUTHOR, author=about['__author__'],
author_email=AUTHOR_EMAIL, author_email=about['__author_email__'],
license=LICENSE, license=about['__license__'],
packages=find_packages(exclude=('tests',)), packages=find_packages(exclude=('tests',)),
install_requires=reqs, install_requires=reqs,
extras_require={ extras_require={

View File

@ -1,3 +1,5 @@
# -*- coding: utf-8 -*-
import os import os
import pandas as pd import pandas as pd