Update CLI help and README
parent
7bb1aee9b6
commit
544e0c9c3f
78
README.md
78
README.md
|
|
@ -4,6 +4,8 @@ Camelot is a Python 2.7 library and command-line tool for extracting tabular dat
|
||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
|
|
||||||
|
### API
|
||||||
|
|
||||||
<pre>
|
<pre>
|
||||||
>>> import camelot
|
>>> import camelot
|
||||||
>>> tables = camelot.read_pdf("foo.pdf")
|
>>> tables = camelot.read_pdf("foo.pdf")
|
||||||
|
|
@ -23,6 +25,82 @@ Camelot is a Python 2.7 library and command-line tool for extracting tabular dat
|
||||||
>>> df = tables[0].df
|
>>> df = tables[0].df
|
||||||
</pre>
|
</pre>
|
||||||
|
|
||||||
|
### Command-line interface
|
||||||
|
|
||||||
|
<pre>
|
||||||
|
Usage: camelot [OPTIONS] FILEPATH
|
||||||
|
|
||||||
|
Options:
|
||||||
|
-p, --pages TEXT Comma-separated page numbers to parse.
|
||||||
|
Example: 1,3,4 or 1,4-end
|
||||||
|
-o, --output TEXT Output filepath.
|
||||||
|
-f, --format [csv|json|excel|html]
|
||||||
|
Output file format.
|
||||||
|
-z, --zip Whether or not to create a ZIP archive.
|
||||||
|
-m, --mesh Whether or not to use Lattice method of
|
||||||
|
parsing. Stream is used by default.
|
||||||
|
-T, --table_area TEXT Table areas (x1,y1,x2,y2) to process.
|
||||||
|
x1, y1
|
||||||
|
-> left-top and x2, y2 -> right-bottom
|
||||||
|
-split, --split_text Whether or not to split text if it spans
|
||||||
|
across multiple cells.
|
||||||
|
-flag, --flag_size (inactive) Whether or not to flag text which
|
||||||
|
has uncommon size. (Useful to detect
|
||||||
|
super/subscripts)
|
||||||
|
-M, --margins <FLOAT FLOAT FLOAT>...
|
||||||
|
char_margin, line_margin, word_margin for
|
||||||
|
PDFMiner.
|
||||||
|
-C, --columns TEXT x-coordinates of column separators.
|
||||||
|
-r, --row_close_tol INTEGER Rows will be formed by combining text
|
||||||
|
vertically within this tolerance.
|
||||||
|
-c, --col_close_tol INTEGER Columns will be formed by combining text
|
||||||
|
horizontally within this tolerance.
|
||||||
|
-back, --process_background (with --mesh) Whether or not to process
|
||||||
|
lines that are in background.
|
||||||
|
-scale, --line_size_scaling INTEGER
|
||||||
|
(with --mesh) Factor by which the page
|
||||||
|
dimensions will be divided to get smallest
|
||||||
|
length of detected lines.
|
||||||
|
-copy, --copy_text [h|v] (with --mesh) Specify direction in which
|
||||||
|
text will be copied over in a spanning cell.
|
||||||
|
-shift, --shift_text [l|r|t|b] (with --mesh) Specify direction in which
|
||||||
|
text in a spanning cell should flow.
|
||||||
|
-l, --line_close_tol INTEGER (with --mesh) Tolerance parameter used to
|
||||||
|
merge close vertical lines and close
|
||||||
|
horizontal lines.
|
||||||
|
-j, --joint_close_tol INTEGER (with --mesh) Tolerance parameter used to
|
||||||
|
decide whether the detected lines and points
|
||||||
|
lie close to each other.
|
||||||
|
-block, --threshold_blocksize INTEGER
|
||||||
|
(with --mesh) For adaptive thresholding,
|
||||||
|
size of a pixel neighborhood that is used to
|
||||||
|
calculate a threshold value for the pixel:
|
||||||
|
3, 5, 7, and so on.
|
||||||
|
-const, --threshold_constant INTEGER
|
||||||
|
(with --mesh) For adaptive thresholding,
|
||||||
|
constant subtracted from the mean or
|
||||||
|
weighted mean.
|
||||||
|
Normally, it is positive but
|
||||||
|
may be zero or negative as well.
|
||||||
|
-I, --iterations INTEGER (with --mesh) Number of times
|
||||||
|
erosion/dilation is applied.
|
||||||
|
-G, --geometry_type [text|table|contour|joint|line]
|
||||||
|
Plot geometry found on pdf page for
|
||||||
|
debugging.
|
||||||
|
|
||||||
|
text: Plot text objects. (Useful
|
||||||
|
to get table_area and columns coordinates)
|
||||||
|
table: Plot parsed table.
|
||||||
|
contour (with
|
||||||
|
--mesh): Plot detected rectangles.
|
||||||
|
joint
|
||||||
|
(with --mesh): Plot detected line
|
||||||
|
intersections.
|
||||||
|
line (with --mesh): Plot
|
||||||
|
detected lines.
|
||||||
|
--help Show this message and exit.
|
||||||
|
</pre>
|
||||||
|
|
||||||
## Dependencies
|
## Dependencies
|
||||||
|
|
||||||
The dependencies include [tk](https://wiki.tcl.tk/3743) and [ghostscript](https://www.ghostscript.com/).
|
The dependencies include [tk](https://wiki.tcl.tk/3743) and [ghostscript](https://www.ghostscript.com/).
|
||||||
|
|
|
||||||
|
|
@ -17,44 +17,69 @@ class Mutex(click.Option):
|
||||||
|
|
||||||
|
|
||||||
@click.command()
|
@click.command()
|
||||||
@click.option("-p", "--pages", default="1", help="")
|
@click.option("-p", "--pages", default="1", help="Comma-separated page numbers"
|
||||||
@click.option("-o", "--output", help="")
|
" to parse. Example: 1,3,4 or 1,4-end")
|
||||||
|
@click.option("-o", "--output", help="Output filepath.")
|
||||||
@click.option("-f", "--format",
|
@click.option("-f", "--format",
|
||||||
type=click.Choice(["csv", "json", "excel", "html"]), help="")
|
type=click.Choice(["csv", "json", "excel", "html"]),
|
||||||
@click.option("-z", "--zip", is_flag=True, help="")
|
help="Output file format.")
|
||||||
|
@click.option("-z", "--zip", is_flag=True, help="Whether or not to create a ZIP"
|
||||||
|
" archive.")
|
||||||
@click.option("-m", "--mesh", is_flag=True, help="Whether or not to"
|
@click.option("-m", "--mesh", is_flag=True, help="Whether or not to"
|
||||||
" use Lattice method of parsing. Stream is used by default.")
|
" use Lattice method of parsing. Stream is used by default.")
|
||||||
|
@click.option("-T", "--table_area", default=[], multiple=True,
|
||||||
|
help="Table areas (x1,y1,x2,y2) to process.\n"
|
||||||
|
" x1, y1 -> left-top and x2, y2 -> right-bottom")
|
||||||
|
@click.option("-split", "--split_text", is_flag=True, help="Whether or not to"
|
||||||
|
" split text if it spans across multiple cells.")
|
||||||
|
@click.option("-flag", "--flag_size", is_flag=True, help="(inactive) Whether or"
|
||||||
|
" not to flag text which has uncommon size. (Useful to detect"
|
||||||
|
" super/subscripts)")
|
||||||
|
@click.option("-M", "--margins", nargs=3, default=(1.0, 0.5, 0.1),
|
||||||
|
help="char_margin, line_margin, word_margin for PDFMiner.")
|
||||||
|
@click.option("-C", "--columns", default=[], multiple=True, cls=Mutex,
|
||||||
|
help="x-coordinates of column separators.")
|
||||||
|
@click.option("-r", "--row_close_tol", default=2, cls=Mutex, help="Rows will be"
|
||||||
|
" formed by combining text vertically within this tolerance.")
|
||||||
|
@click.option("-c", "--col_close_tol", default=0, cls=Mutex, help="Columns will"
|
||||||
|
" be formed by combining text horizontally within this tolerance.")
|
||||||
|
@click.option("-back", "--process_background", is_flag=True, cls=Mutex,
|
||||||
|
help="(with --mesh) Whether or not to process lines that are in"
|
||||||
|
" background.")
|
||||||
|
@click.option("-scale", "--line_size_scaling", default=15, cls=Mutex,
|
||||||
|
help="(with --mesh) Factor by which the page dimensions will be"
|
||||||
|
" divided to get smallest length of detected lines.")
|
||||||
|
@click.option("-copy", "--copy_text", default=[], type=click.Choice(["h", "v"]),
|
||||||
|
multiple=True, cls=Mutex, help="(with --mesh) Specify direction"
|
||||||
|
" in which text will be copied over in a spanning cell.")
|
||||||
|
@click.option("-shift", "--shift_text", default=["l", "t"],
|
||||||
|
type=click.Choice(["l", "r", "t", "b"]), multiple=True, cls=Mutex,
|
||||||
|
help="(with --mesh) Specify direction in which text in a spanning"
|
||||||
|
" cell should flow.")
|
||||||
|
@click.option("-l", "--line_close_tol", default=2, cls=Mutex,
|
||||||
|
help="(with --mesh) Tolerance parameter used to merge close vertical"
|
||||||
|
" lines and close horizontal lines.")
|
||||||
|
@click.option("-j", "--joint_close_tol", default=2, cls=Mutex,
|
||||||
|
help="(with --mesh) Tolerance parameter used to decide whether"
|
||||||
|
" the detected lines and points lie close to each other.")
|
||||||
|
@click.option("-block", "--threshold_blocksize", default=15, cls=Mutex,
|
||||||
|
help="(with --mesh) For adaptive thresholding, size of a pixel"
|
||||||
|
" neighborhood that is used to calculate a threshold value for"
|
||||||
|
" the pixel: 3, 5, 7, and so on.")
|
||||||
|
@click.option("-const", "--threshold_constant", default=-2, cls=Mutex,
|
||||||
|
help="(with --mesh) For adaptive thresholding, constant subtracted"
|
||||||
|
" from the mean or weighted mean.\nNormally, it is positive but"
|
||||||
|
" may be zero or negative as well.")
|
||||||
|
@click.option("-I", "--iterations", default=0, cls=Mutex,
|
||||||
|
help="(with --mesh) Number of times for erosion/dilation is"
|
||||||
|
" applied.")
|
||||||
@click.option("-G", "--geometry_type",
|
@click.option("-G", "--geometry_type",
|
||||||
type=click.Choice(["text", "table", "contour", "joint", "line"]),
|
type=click.Choice(["text", "table", "contour", "joint", "line"]),
|
||||||
help="Plot geometry found on pdf page for debugging.")
|
help="Plot geometry found on pdf page for debugging.\n\n"
|
||||||
@click.option("-T", "--table_area", default=[], multiple=True,
|
"text: Plot text objects. (Useful to get table_area and"
|
||||||
help="")
|
" columns coordinates)\ntable: Plot parsed table.\n"
|
||||||
@click.option("-split", "--split_text", is_flag=True, help="")
|
"contour (with --mesh): Plot detected rectangles.\njoint (with --mesh): Plot detected line"
|
||||||
@click.option("-flag", "--flag_size", is_flag=True, help="")
|
" intersections.\nline (with --mesh): Plot detected lines.")
|
||||||
@click.option("-M", "--margins", nargs=3, default=(1.0, 0.5, 0.1),
|
|
||||||
help="")
|
|
||||||
@click.option("-C", "--columns", default=[], multiple=True, cls=Mutex,
|
|
||||||
help="")
|
|
||||||
@click.option("-r", "--row_close_tol", default=2, cls=Mutex, help="")
|
|
||||||
@click.option("-c", "--col_close_tol", default=0, cls=Mutex, help="")
|
|
||||||
@click.option("-back", "--process_background", is_flag=True, cls=Mutex,
|
|
||||||
help="Use with --mesh")
|
|
||||||
@click.option("-scale", "--line_size_scaling", default=15, cls=Mutex,
|
|
||||||
help="Use with --mesh")
|
|
||||||
@click.option("-copy", "--copy_text", default=[], cls=Mutex,
|
|
||||||
help="Use with --mesh")
|
|
||||||
@click.option("-shift", "--shift_text", default=["l", "t"], cls=Mutex,
|
|
||||||
help="Use with --mesh")
|
|
||||||
@click.option("-l", "--line_close_tol", default=2, cls=Mutex,
|
|
||||||
help="Use with --mesh")
|
|
||||||
@click.option("-j", "--joint_close_tol", default=2, cls=Mutex,
|
|
||||||
help="Use with --mesh")
|
|
||||||
@click.option("-block", "--threshold_blocksize", default=15, cls=Mutex,
|
|
||||||
help="Use with --mesh")
|
|
||||||
@click.option("-const", "--threshold_constant", default=-2, cls=Mutex,
|
|
||||||
help="Use with --mesh")
|
|
||||||
@click.option("-I", "--iterations", default=0, cls=Mutex,
|
|
||||||
help="Use with --mesh")
|
|
||||||
@click.argument("filepath", type=click.Path(exists=True))
|
@click.argument("filepath", type=click.Path(exists=True))
|
||||||
def cli(*args, **kwargs):
|
def cli(*args, **kwargs):
|
||||||
pages = kwargs.pop("pages")
|
pages = kwargs.pop("pages")
|
||||||
|
|
@ -69,6 +94,9 @@ def cli(*args, **kwargs):
|
||||||
kwargs['table_area'] = None if not table_area else table_area
|
kwargs['table_area'] = None if not table_area else table_area
|
||||||
columns = list(kwargs['columns'])
|
columns = list(kwargs['columns'])
|
||||||
kwargs['columns'] = None if not columns else columns
|
kwargs['columns'] = None if not columns else columns
|
||||||
|
copy_text = list(kwargs['copy_text'])
|
||||||
|
kwargs['copy_text'] = None if not copy_text else copy_text
|
||||||
|
kwargs['shift_text'] = list(kwargs['shift_text'])
|
||||||
|
|
||||||
kwargs = remove_extra(kwargs, mesh=mesh)
|
kwargs = remove_extra(kwargs, mesh=mesh)
|
||||||
if geometry_type is None:
|
if geometry_type is None:
|
||||||
|
|
|
||||||
|
|
@ -19,7 +19,7 @@ def read_pdf(filepath, pages='1', mesh=False, **kwargs):
|
||||||
Whether or not to use Lattice method of parsing. Stream
|
Whether or not to use Lattice method of parsing. Stream
|
||||||
is used by default.
|
is used by default.
|
||||||
table_area : list, optional (default: None)
|
table_area : list, optional (default: None)
|
||||||
List of table areas to analyze as strings of the form
|
List of table areas to process as strings of the form
|
||||||
x1,y1,x2,y2 where (x1, y1) -> left-top and
|
x1,y1,x2,y2 where (x1, y1) -> left-top and
|
||||||
(x2, y2) -> right-bottom in pdf coordinate space.
|
(x2, y2) -> right-bottom in pdf coordinate space.
|
||||||
columns^ : list, optional (default: None)
|
columns^ : list, optional (default: None)
|
||||||
|
|
|
||||||
|
|
@ -32,7 +32,7 @@ def plot_geometry(filepath, pages='1', mesh=False, geometry_type=None, **kwargs)
|
||||||
'joint'* : Plot detected line intersections.
|
'joint'* : Plot detected line intersections.
|
||||||
'line'* : Plot detected lines.
|
'line'* : Plot detected lines.
|
||||||
table_area : list, optional (default: None)
|
table_area : list, optional (default: None)
|
||||||
List of table areas to analyze as strings of the form
|
List of table areas to process as strings of the form
|
||||||
x1,y1,x2,y2 where (x1, y1) -> left-top and
|
x1,y1,x2,y2 where (x1, y1) -> left-top and
|
||||||
(x2, y2) -> right-bottom in pdf coordinate space.
|
(x2, y2) -> right-bottom in pdf coordinate space.
|
||||||
columns^ : list, optional (default: None)
|
columns^ : list, optional (default: None)
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue