Add utf8 header
parent
3600025a22
commit
be2733ebd2
13
README.md
13
README.md
|
|
@ -1,4 +1,4 @@
|
|||
# Camelot: PDF Table Parsing for Humans
|
||||
# Camelot: PDF Table Extraction for Humans
|
||||
|
||||
 
|
||||
|
||||
|
|
@ -38,7 +38,7 @@
|
|||
| 2032_2 | 0.17 | 57.8 | 21.7% | 0.3% | 2.7% | 1.2% |
|
||||
| 4171_1 | 0.07 | 173.9 | 58.1% | 1.6% | 2.1% | 0.5% |
|
||||
|
||||
There's a [command-line interface]() too!
|
||||
There's a [command-line interface](http://camelot-py.readthedocs.io/en/master/user/cli.html) too!
|
||||
|
||||
## Why Camelot?
|
||||
|
||||
|
|
@ -46,13 +46,12 @@ There's a [command-line interface]() too!
|
|||
- **Metrics**: *Bad* tables can be discarded based on metrics like accuracy and whitespace, without ever having to manually look at each table.
|
||||
- Each table is a **pandas DataFrame**, which enables seamless integration into [ETL and data analysis workflows](https://gist.github.com/vinayak-mehta/e5949f7c2410a0e12f25d3682dc9e873).
|
||||
- **Export** to multiple formats, including json, excel and html.
|
||||
- Simple and Elegant API, written in **Python**!
|
||||
|
||||
See [comparison with other PDF parsing libraries and tools](https://github.com/socialcopsdev/camelot/wiki/Comparison-with-other-PDF-Table-Parsing-libraries-and-tools).
|
||||
See [comparison with other PDF table extraction libraries and tools](https://github.com/socialcopsdev/camelot/wiki/Comparison-with-other-PDF-Table-Extraction-libraries-and-tools).
|
||||
|
||||
## Installation
|
||||
|
||||
After [installing the dependencies](), you can simply use pip to install Camelot:
|
||||
After [installing the dependencies](http://camelot-py.readthedocs.io/en/master/user/install.html) ([tk](https://packages.ubuntu.com/trusty/python-tk) and [ghostscript](https://www.ghostscript.com/)), you can simply use pip to install Camelot:
|
||||
|
||||
<pre>
|
||||
$ pip install camelot-py
|
||||
|
|
@ -60,7 +59,7 @@ $ pip install camelot-py
|
|||
|
||||
### Alternatively
|
||||
|
||||
You can install the dependencies [tk](https://packages.ubuntu.com/trusty/python-tk) and [ghostscript](https://www.ghostscript.com/) using your system's package manager. After that, clone the repo using:
|
||||
After [installing the dependencies](http://camelot-py.readthedocs.io/en/master/user/install.html), clone the repo using:
|
||||
|
||||
<pre>
|
||||
$ git clone https://www.github.com/socialcopsdev/camelot
|
||||
|
|
@ -77,7 +76,7 @@ Note: Use a [virtualenv](https://virtualenv.pypa.io/en/stable/) if you don't wan
|
|||
|
||||
## Documentation
|
||||
|
||||
Great documentation is available at [insert link]().
|
||||
Great documentation is available at [http://camelot-py.readthedocs.io/](http://camelot-py.readthedocs.io/).
|
||||
|
||||
## Development
|
||||
|
||||
|
|
|
|||
|
|
@ -1,3 +1,5 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
from .__version__ import __version__
|
||||
|
||||
from .io import read_pdf
|
||||
|
|
@ -1,3 +1,11 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
VERSION = (0, 1, 0)
|
||||
|
||||
__title__ = 'camelot-py'
|
||||
__description__ = 'PDF Table Extraction for Humans.'
|
||||
__url__ = 'http://camelot-py.readthedocs.io/'
|
||||
__version__ = '.'.join(map(str, VERSION))
|
||||
__author__ = 'Vinayak Mehta'
|
||||
__author_email__ = 'vmehta94@gmail.com'
|
||||
__license__ = 'MIT License'
|
||||
|
|
@ -1,4 +1,5 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
from pprint import pprint
|
||||
|
||||
import click
|
||||
|
|
@ -20,23 +21,22 @@ pass_config = click.make_pass_decorator(Config)
|
|||
|
||||
@click.group()
|
||||
@click.version_option(version=__version__)
|
||||
@click.option('-p', '--pages', default='1', help='Comma-separated page numbers'
|
||||
' to parse. Example: 1,3,4 or 1,4-end')
|
||||
@click.option('-o', '--output', help='Output filepath.')
|
||||
@click.option('-p', '--pages', default='1', help='Comma-separated page numbers.'
|
||||
' Example: 1,3,4 or 1,4-end.')
|
||||
@click.option('-o', '--output', help='Output file path.')
|
||||
@click.option('-f', '--format',
|
||||
type=click.Choice(['csv', 'json', 'excel', 'html']),
|
||||
help='Output file format.')
|
||||
@click.option('-z', '--zip', is_flag=True, help='Whether or not to create a ZIP'
|
||||
' archive.')
|
||||
@click.option('-split', '--split_text', is_flag=True, help='Whether or not to'
|
||||
' split text if it spans across multiple cells.')
|
||||
@click.option('-flag', '--flag_size', is_flag=True, help='(inactive) Whether or'
|
||||
' not to flag text which has uncommon size. (Useful to detect'
|
||||
' super/subscripts)')
|
||||
@click.option('-z', '--zip', is_flag=True, help='Create ZIP archive.')
|
||||
@click.option('-split', '--split_text', is_flag=True,
|
||||
help='Split text that spans across multiple cells.')
|
||||
@click.option('-flag', '--flag_size', is_flag=True, help='Flag text based on'
|
||||
' font size. Useful to detect super/subscripts.')
|
||||
@click.option('-M', '--margins', nargs=3, default=(1.0, 0.5, 0.1),
|
||||
help='char_margin, line_margin, word_margin for PDFMiner.')
|
||||
help='PDFMiner char_margin, line_margin and word_margin.')
|
||||
@click.pass_context
|
||||
def cli(ctx, *args, **kwargs):
|
||||
"""Camelot: PDF Table Extraction for Humans"""
|
||||
ctx.obj = Config()
|
||||
for key, value in kwargs.iteritems():
|
||||
ctx.obj.set_config(key, value)
|
||||
|
|
@ -44,45 +44,42 @@ def cli(ctx, *args, **kwargs):
|
|||
|
||||
@cli.command('lattice')
|
||||
@click.option('-T', '--table_area', default=[], multiple=True,
|
||||
help='Table areas (x1,y1,x2,y2) to process.\n'
|
||||
' x1, y1 -> left-top and x2, y2 -> right-bottom')
|
||||
help='Table areas to process. Example: x1,y1,x2,y2'
|
||||
' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
|
||||
@click.option('-back', '--process_background', is_flag=True,
|
||||
help='Whether or not to process lines that are in'
|
||||
' background.')
|
||||
help='Process background lines.')
|
||||
@click.option('-scale', '--line_size_scaling', default=15,
|
||||
help='Factor by which the page dimensions will be'
|
||||
' divided to get smallest length of detected lines.')
|
||||
help='Line size scaling factor. The larger the value,'
|
||||
' the smaller the detected lines.')
|
||||
@click.option('-copy', '--copy_text', default=[], type=click.Choice(['h', 'v']),
|
||||
multiple=True, help='Specify direction'
|
||||
' in which text will be copied over in a spanning cell.')
|
||||
multiple=True, help='Direction in which text in a spanning cell'
|
||||
' will be copied over.')
|
||||
@click.option('-shift', '--shift_text', default=['l', 't'],
|
||||
type=click.Choice(['', 'l', 'r', 't', 'b']), multiple=True,
|
||||
help='Specify direction in which text in a spanning'
|
||||
' cell should flow.')
|
||||
help='Direction in which text in a spanning cell will flow.')
|
||||
@click.option('-l', '--line_close_tol', default=2,
|
||||
help='Tolerance parameter used to merge close vertical'
|
||||
' lines and close horizontal lines.')
|
||||
' and horizontal lines.')
|
||||
@click.option('-j', '--joint_close_tol', default=2,
|
||||
help='Tolerance parameter used to decide whether'
|
||||
' the detected lines and points lie close to each other.')
|
||||
@click.option('-block', '--threshold_blocksize', default=15,
|
||||
help='For adaptive thresholding, size of a pixel'
|
||||
' neighborhood that is used to calculate a threshold value for'
|
||||
' the pixel: 3, 5, 7, and so on.')
|
||||
' the pixel. Example: 3, 5, 7, and so on.')
|
||||
@click.option('-const', '--threshold_constant', default=-2,
|
||||
help='For adaptive thresholding, constant subtracted'
|
||||
' from the mean or weighted mean.\nNormally, it is positive but'
|
||||
' from the mean or weighted mean. Normally, it is positive but'
|
||||
' may be zero or negative as well.')
|
||||
@click.option('-I', '--iterations', default=0,
|
||||
help='Number of times for erosion/dilation is'
|
||||
' applied.')
|
||||
help='Number of times for erosion/dilation will be applied.')
|
||||
@click.option('-plot', '--plot_type',
|
||||
type=click.Choice(['text', 'table', 'contour', 'joint', 'line']),
|
||||
help='Plot geometry found on PDF page for debugging.')
|
||||
help='Plot geometry found on PDF page, for debugging.')
|
||||
@click.argument('filepath', type=click.Path(exists=True))
|
||||
@pass_config
|
||||
def lattice(c, *args, **kwargs):
|
||||
"""Use lines between text to parse table."""
|
||||
"""Use lines between text to parse the table."""
|
||||
conf = c.config
|
||||
pages = conf.pop('pages')
|
||||
output = conf.pop('output')
|
||||
|
|
@ -105,29 +102,29 @@ def lattice(c, *args, **kwargs):
|
|||
table.plot(plot_type)
|
||||
else:
|
||||
if output is None:
|
||||
raise click.UsageError('Please specify output filepath using --output')
|
||||
raise click.UsageError('Please specify output file path using --output')
|
||||
if f is None:
|
||||
raise click.UsageError('Please specify output format using --format')
|
||||
raise click.UsageError('Please specify output file format using --format')
|
||||
tables.export(output, f=f, compress=compress)
|
||||
|
||||
|
||||
@cli.command('stream')
|
||||
@click.option('-T', '--table_area', default=[], multiple=True,
|
||||
help='Table areas (x1,y1,x2,y2) to process.\n'
|
||||
' x1, y1 -> left-top and x2, y2 -> right-bottom')
|
||||
help='Table areas to process. Example: x1,y1,x2,y2'
|
||||
' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
|
||||
@click.option('-C', '--columns', default=[], multiple=True,
|
||||
help='x-coordinates of column separators.')
|
||||
@click.option('-r', '--row_close_tol', default=2, help='Rows will be'
|
||||
' formed by combining text vertically within this tolerance.')
|
||||
@click.option('-c', '--col_close_tol', default=0, help='Columns will'
|
||||
' be formed by combining text horizontally within this tolerance.')
|
||||
help='X coordinates of column separators.')
|
||||
@click.option('-r', '--row_close_tol', default=2, help='Tolerance parameter'
|
||||
' used to combine text vertically, to generate rows.')
|
||||
@click.option('-c', '--col_close_tol', default=0, help='Tolerance parameter'
|
||||
' used to combine text horizontally, to generate columns.')
|
||||
@click.option('-plot', '--plot_type',
|
||||
type=click.Choice(['text', 'table']),
|
||||
help='Plot geometry found on PDF page for debugging.')
|
||||
@click.argument('filepath', type=click.Path(exists=True))
|
||||
@pass_config
|
||||
def stream(c, *args, **kwargs):
|
||||
"""Use spaces between text to parse table."""
|
||||
"""Use spaces between text to parse the table."""
|
||||
conf = c.config
|
||||
pages = conf.pop('pages')
|
||||
output = conf.pop('output')
|
||||
|
|
@ -149,7 +146,7 @@ def stream(c, *args, **kwargs):
|
|||
table.plot(plot_type)
|
||||
else:
|
||||
if output is None:
|
||||
raise click.UsageError('Please specify output filepath using --output')
|
||||
raise click.UsageError('Please specify output file path using --output')
|
||||
if f is None:
|
||||
raise click.UsageError('Please specify output format using --format')
|
||||
raise click.UsageError('Please specify output file format using --format')
|
||||
tables.export(output, f=f, compress=compress)
|
||||
|
|
@ -1,3 +1,5 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
import os
|
||||
import json
|
||||
import zipfile
|
||||
|
|
@ -11,7 +13,7 @@ from .plotting import *
|
|||
|
||||
class Cell(object):
|
||||
"""Defines a cell in a table with coordinates relative to a
|
||||
left-bottom origin. (pdf coordinate space)
|
||||
left-bottom origin. (PDF coordinate space)
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
|
@ -89,7 +91,7 @@ class Cell(object):
|
|||
|
||||
class Table(object):
|
||||
"""Defines a table with coordinates relative to a left-bottom
|
||||
origin. (pdf coordinate space)
|
||||
origin. (PDF coordinate space)
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
|
@ -110,9 +112,9 @@ class Table(object):
|
|||
whitespace : float
|
||||
Percentage of whitespace in the table.
|
||||
order : int
|
||||
Table number on pdf page.
|
||||
Table number on PDF page.
|
||||
page : int
|
||||
Pdf page number.
|
||||
PDF page number.
|
||||
|
||||
"""
|
||||
def __init__(self, cols, rows):
|
||||
|
|
|
|||
|
|
@ -1,3 +1,5 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
import os
|
||||
|
||||
from PyPDF2 import PdfFileReader, PdfFileWriter
|
||||
|
|
@ -10,16 +12,16 @@ from .utils import (TemporaryDirectory, get_page_layout, get_text_objects,
|
|||
|
||||
class PDFHandler(object):
|
||||
"""Handles all operations like temp directory creation, splitting
|
||||
file into single page pdfs, parsing each pdf and then removing the
|
||||
file into single page PDFs, parsing each PDF and then removing the
|
||||
temp directory.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
filename : str
|
||||
Path to pdf file.
|
||||
Path to PDF file.
|
||||
pages : str, optional (default: '1')
|
||||
Comma-separated page numbers to parse.
|
||||
Example: 1,3,4 or 1,4-end
|
||||
Comma-separated page numbers.
|
||||
Example: 1,3,4 or 1,4-end.
|
||||
|
||||
"""
|
||||
def __init__(self, filename, pages='1'):
|
||||
|
|
@ -34,10 +36,10 @@ class PDFHandler(object):
|
|||
Parameters
|
||||
----------
|
||||
filename : str
|
||||
Path to pdf file.
|
||||
Path to PDF file.
|
||||
pages : str, optional (default: '1')
|
||||
Comma-separated page numbers to parse.
|
||||
Example: 1,3,4 or 1,4-end
|
||||
Comma-separated page numbers.
|
||||
Example: 1,3,4 or 1,4-end.
|
||||
|
||||
Returns
|
||||
-------
|
||||
|
|
@ -67,16 +69,16 @@ class PDFHandler(object):
|
|||
return sorted(set(P))
|
||||
|
||||
def _save_page(self, filename, page, temp):
|
||||
"""Saves specified page from pdf into a temporary directory.
|
||||
"""Saves specified page from PDF into a temporary directory.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
filename : str
|
||||
Path to pdf file.
|
||||
Path to PDF file.
|
||||
page : int
|
||||
Page number
|
||||
Page number.
|
||||
temp : str
|
||||
Tmp directory
|
||||
Tmp directory.
|
||||
|
||||
"""
|
||||
with open(filename, 'rb') as fileobj:
|
||||
|
|
@ -91,7 +93,7 @@ class PDFHandler(object):
|
|||
with open(fpath, 'wb') as f:
|
||||
outfile.write(f)
|
||||
layout, dim = get_page_layout(fpath)
|
||||
# fix rotated pdf
|
||||
# fix rotated PDF
|
||||
lttextlh = get_text_objects(layout, ltype="lh")
|
||||
lttextlv = get_text_objects(layout, ltype="lv")
|
||||
ltchar = get_text_objects(layout, ltype="char")
|
||||
|
|
@ -114,7 +116,7 @@ class PDFHandler(object):
|
|||
|
||||
def parse(self, flavor='lattice', **kwargs):
|
||||
"""Extracts tables by calling parser.get_tables on all single
|
||||
page pdfs.
|
||||
page PDFs.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
|
@ -127,10 +129,10 @@ class PDFHandler(object):
|
|||
Returns
|
||||
-------
|
||||
tables : camelot.core.TableList
|
||||
List of tables found in pdf.
|
||||
List of tables found in PDF.
|
||||
geometry : camelot.core.GeometryList
|
||||
List of geometry objects (contours, lines, joints)
|
||||
found in pdf.
|
||||
List of geometry objects (contours, lines, joints) found
|
||||
in PDF.
|
||||
|
||||
"""
|
||||
tables = []
|
||||
|
|
|
|||
|
|
@ -1,3 +1,5 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import division
|
||||
from itertools import groupby
|
||||
from operator import itemgetter
|
||||
|
|
|
|||
|
|
@ -1,9 +1,11 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
from .handlers import PDFHandler
|
||||
from .utils import validate_input, remove_extra
|
||||
|
||||
|
||||
def read_pdf(filepath, pages='1', flavor='lattice', **kwargs):
|
||||
"""Read PDF and return parsed data tables.
|
||||
"""Read PDF and return extracted tables.
|
||||
|
||||
Note: kwargs annotated with ^ can only be used with flavor='stream'
|
||||
and kwargs annotated with * can only be used with flavor='lattice'.
|
||||
|
|
@ -11,53 +13,47 @@ def read_pdf(filepath, pages='1', flavor='lattice', **kwargs):
|
|||
Parameters
|
||||
----------
|
||||
filepath : str
|
||||
Path to pdf file.
|
||||
Path to PDF file.
|
||||
pages : str, optional (default: '1')
|
||||
Comma-separated page numbers to parse.
|
||||
Example: 1,3,4 or 1,4-end
|
||||
Comma-separated page numbers.
|
||||
Example: 1,3,4 or 1,4-end.
|
||||
flavor : str (default: 'lattice')
|
||||
The parsing method to use ('lattice' or 'stream').
|
||||
Lattice is used by default.
|
||||
table_area : list, optional (default: None)
|
||||
List of table areas to process as strings of the form
|
||||
x1,y1,x2,y2 where (x1, y1) -> left-top and
|
||||
(x2, y2) -> right-bottom in pdf coordinate space.
|
||||
List of table area strings of the form x1,y1,x2,y2
|
||||
where (x1, y1) -> left-top and (x2, y2) -> right-bottom
|
||||
in PDF coordinate space.
|
||||
columns^ : list, optional (default: None)
|
||||
List of column x-coordinates as strings where the coordinates
|
||||
List of column x-coordinate strings, where the coordinates
|
||||
are comma-separated.
|
||||
split_text : bool, optional (default: False)
|
||||
Whether or not to split a text line if it spans across
|
||||
multiple cells.
|
||||
Split text that spans across multiple cells.
|
||||
flag_size : bool, optional (default: False)
|
||||
Whether or not to highlight a substring using <s></s>
|
||||
if its size is different from rest of the string. (Useful for
|
||||
super and subscripts)
|
||||
Flag text based on font size. Useful to detect
|
||||
super/subscripts. Adds <s></s> around flagged text.
|
||||
row_close_tol^ : int, optional (default: 2)
|
||||
Rows will be formed by combining text vertically
|
||||
within this tolerance.
|
||||
Tolerance parameter used to combine text vertically,
|
||||
to generate rows.
|
||||
col_close_tol^ : int, optional (default: 0)
|
||||
Columns will be formed by combining text horizontally
|
||||
within this tolerance.
|
||||
Tolerance parameter used to combine text horizontally,
|
||||
to generate columns.
|
||||
process_background* : bool, optional (default: False)
|
||||
Whether or not to process lines that are in background.
|
||||
Process background lines.
|
||||
line_size_scaling* : int, optional (default: 15)
|
||||
Factor by which the page dimensions will be divided to get
|
||||
smallest length of lines that should be detected.
|
||||
|
||||
The larger this value, smaller the detected lines. Making it
|
||||
too large will lead to text being detected as lines.
|
||||
Line size scaling factor. The larger the value, the smaller
|
||||
the detected lines. Making it very large will lead to text
|
||||
being detected as lines.
|
||||
copy_text* : list, optional (default: None)
|
||||
{'h', 'v'}
|
||||
Select one or more strings from above and pass them as a list
|
||||
to specify the direction in which text should be copied over
|
||||
when a cell spans multiple rows or columns.
|
||||
Direction in which text in a spanning cell will be copied
|
||||
over.
|
||||
shift_text* : list, optional (default: ['l', 't'])
|
||||
{'l', 'r', 't', 'b'}
|
||||
Select one or more strings from above and pass them as a list
|
||||
to specify where the text in a spanning cell should flow.
|
||||
Direction in which text in a spanning cell will flow.
|
||||
line_close_tol* : int, optional (default: 2)
|
||||
Tolerance parameter used to merge vertical and horizontal
|
||||
detected lines which lie close to each other.
|
||||
Tolerance parameter used to merge close vertical and horizontal
|
||||
lines.
|
||||
joint_close_tol* : int, optional (default: 2)
|
||||
Tolerance parameter used to decide whether the detected lines
|
||||
and points lie close to each other.
|
||||
|
|
@ -76,7 +72,7 @@ def read_pdf(filepath, pages='1', flavor='lattice', **kwargs):
|
|||
|
||||
For more information, refer to `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
|
||||
margins : tuple
|
||||
PDFMiner margins. (char_margin, line_margin, word_margin)
|
||||
PDFMiner char_margin, line_margin and word_margin.
|
||||
|
||||
For more information, refer to `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
|
||||
|
||||
|
|
|
|||
|
|
@ -1,2 +1,4 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
from .stream import Stream
|
||||
from .lattice import Lattice
|
||||
|
|
@ -1,3 +1,5 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
import os
|
||||
|
||||
from ..utils import get_page_layout, get_text_objects
|
||||
|
|
|
|||
|
|
@ -1,3 +1,5 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import division
|
||||
import os
|
||||
import copy
|
||||
|
|
@ -21,41 +23,35 @@ logger = setup_logging(__name__)
|
|||
|
||||
class Lattice(BaseParser):
|
||||
"""Lattice method of parsing looks for lines between text
|
||||
to parse table.
|
||||
to parse the table.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
table_area : list, optional (default: None)
|
||||
List of table areas to analyze as strings of the form
|
||||
x1,y1,x2,y2 where (x1, y1) -> left-top and
|
||||
(x2, y2) -> right-bottom in pdf coordinate space.
|
||||
List of table area strings of the form x1,y1,x2,y2
|
||||
where (x1, y1) -> left-top and (x2, y2) -> right-bottom
|
||||
in PDF coordinate space.
|
||||
process_background : bool, optional (default: False)
|
||||
Whether or not to process lines that are in background.
|
||||
Process background lines.
|
||||
line_size_scaling : int, optional (default: 15)
|
||||
Factor by which the page dimensions will be divided to get
|
||||
smallest length of lines that should be detected.
|
||||
|
||||
The larger this value, smaller the detected lines. Making it
|
||||
too large will lead to text being detected as lines.
|
||||
Line size scaling factor. The larger the value, the smaller
|
||||
the detected lines. Making it very large will lead to text
|
||||
being detected as lines.
|
||||
copy_text : list, optional (default: None)
|
||||
{'h', 'v'}
|
||||
Select one or more strings from above and pass them as a list
|
||||
to specify the direction in which text should be copied over
|
||||
when a cell spans multiple rows or columns.
|
||||
Direction in which text in a spanning cell will be copied
|
||||
over.
|
||||
shift_text : list, optional (default: ['l', 't'])
|
||||
{'l', 'r', 't', 'b'}
|
||||
Select one or more strings from above and pass them as a list
|
||||
to specify where the text in a spanning cell should flow.
|
||||
Direction in which text in a spanning cell will flow.
|
||||
split_text : bool, optional (default: False)
|
||||
Whether or not to split a text line if it spans across
|
||||
multiple cells.
|
||||
Split text that spans across multiple cells.
|
||||
flag_size : bool, optional (default: False)
|
||||
Whether or not to highlight a substring using <s></s>
|
||||
if its size is different from rest of the string. (Useful for
|
||||
super and subscripts)
|
||||
Flag text based on font size. Useful to detect
|
||||
super/subscripts. Adds <s></s> around flagged text.
|
||||
line_close_tol : int, optional (default: 2)
|
||||
Tolerance parameter used to merge vertical and horizontal
|
||||
detected lines which lie close to each other.
|
||||
Tolerance parameter used to merge close vertical and horizontal
|
||||
lines.
|
||||
joint_close_tol : int, optional (default: 2)
|
||||
Tolerance parameter used to decide whether the detected lines
|
||||
and points lie close to each other.
|
||||
|
|
@ -74,7 +70,7 @@ class Lattice(BaseParser):
|
|||
|
||||
For more information, refer to `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
|
||||
margins : tuple
|
||||
PDFMiner margins. (char_margin, line_margin, word_margin)
|
||||
PDFMiner char_margin, line_margin and word_margin.
|
||||
|
||||
For more information, refer to `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
|
||||
|
||||
|
|
|
|||
|
|
@ -1,3 +1,5 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import division
|
||||
import os
|
||||
import logging
|
||||
|
|
@ -16,7 +18,7 @@ logger = setup_logging(__name__)
|
|||
|
||||
class Stream(BaseParser):
|
||||
"""Stream method of parsing looks for spaces between text
|
||||
to parse table.
|
||||
to parse the table.
|
||||
|
||||
If you want to specify columns when specifying multiple table
|
||||
areas, make sure that the length of both lists are equal.
|
||||
|
|
@ -24,27 +26,25 @@ class Stream(BaseParser):
|
|||
Parameters
|
||||
----------
|
||||
table_area : list, optional (default: None)
|
||||
List of table areas to analyze as strings of the form
|
||||
x1,y1,x2,y2 where (x1, y1) -> left-top and
|
||||
(x2, y2) -> right-bottom in pdf coordinate space.
|
||||
List of table area strings of the form x1,y1,x2,y2
|
||||
where (x1, y1) -> left-top and (x2, y2) -> right-bottom
|
||||
in PDF coordinate space.
|
||||
columns : list, optional (default: None)
|
||||
List of column x-coordinates as strings where the coordinates
|
||||
List of column x-coordinate strings, where the coordinates
|
||||
are comma-separated.
|
||||
split_text : bool, optional (default: False)
|
||||
Whether or not to split a text line if it spans across
|
||||
multiple cells.
|
||||
Split text that spans across multiple cells.
|
||||
flag_size : bool, optional (default: False)
|
||||
Whether or not to highlight a substring using <s></s>
|
||||
if its size is different from rest of the string. (Useful for
|
||||
super and subscripts)
|
||||
Flag text based on font size. Useful to detect
|
||||
super/subscripts. Adds <s></s> around flagged text.
|
||||
row_close_tol : int, optional (default: 2)
|
||||
Rows will be formed by combining text vertically
|
||||
within this tolerance.
|
||||
Tolerance parameter used to combine text vertically,
|
||||
to generate rows.
|
||||
col_close_tol : int, optional (default: 0)
|
||||
Columns will be formed by combining text horizontally
|
||||
within this tolerance.
|
||||
Tolerance parameter used to combine text horizontally,
|
||||
to generate columns.
|
||||
margins : tuple, optional (default: (1.0, 0.5, 0.1))
|
||||
PDFMiner margins. (char_margin, line_margin, word_margin)
|
||||
PDFMiner char_margin, line_margin and word_margin.
|
||||
|
||||
For more information, refer to `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
|
||||
|
||||
|
|
|
|||
30
setup.py
30
setup.py
|
|
@ -1,3 +1,5 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
import os
|
||||
from setuptools import find_packages
|
||||
from pkg_resources import parse_version
|
||||
|
|
@ -8,16 +10,8 @@ about = {}
|
|||
with open(os.path.join(here, 'camelot', '__version__.py'), 'r') as f:
|
||||
exec(f.read(), about)
|
||||
|
||||
# TODO: Move these to __version__.py
|
||||
NAME = 'camelot-py'
|
||||
VERSION = about['__version__']
|
||||
DESCRIPTION = 'PDF Table Parsing for Humans'
|
||||
with open('README.md') as f:
|
||||
LONG_DESCRIPTION = f.read()
|
||||
URL = 'https://github.com/socialcopsdev/camelot'
|
||||
AUTHOR = 'Vinayak Mehta'
|
||||
AUTHOR_EMAIL = 'vmehta94@gmail.com'
|
||||
LICENSE = 'MIT License'
|
||||
with open('README.md', 'r') as f:
|
||||
readme = f.read()
|
||||
|
||||
|
||||
def setup_package():
|
||||
|
|
@ -31,14 +25,14 @@ def setup_package():
|
|||
for line in f:
|
||||
dev_reqs.append(line.strip())
|
||||
|
||||
metadata = dict(name=NAME,
|
||||
version=VERSION,
|
||||
description=DESCRIPTION,
|
||||
long_description=LONG_DESCRIPTION,
|
||||
url=URL,
|
||||
author=AUTHOR,
|
||||
author_email=AUTHOR_EMAIL,
|
||||
license=LICENSE,
|
||||
metadata = dict(name=about['__title__'],
|
||||
version=about['__version__'],
|
||||
description=about['__description__'],
|
||||
long_description=readme,
|
||||
url=about['__url__'],
|
||||
author=about['__author__'],
|
||||
author_email=about['__author_email__'],
|
||||
license=about['__license__'],
|
||||
packages=find_packages(exclude=('tests',)),
|
||||
install_requires=reqs,
|
||||
extras_require={
|
||||
|
|
|
|||
|
|
@ -1,3 +1,5 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
import os
|
||||
|
||||
import pandas as pd
|
||||
|
|
|
|||
Loading…
Reference in New Issue