From 544e0c9c3f8c372e4a70aef03f71727b1b14ee39 Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Mon, 10 Sep 2018 16:05:51 +0530 Subject: [PATCH] Update CLI help and README --- README.md | 78 ++++++++++++++++++++++++++++++++++++ camelot/cli.py | 96 +++++++++++++++++++++++++++++---------------- camelot/io.py | 2 +- camelot/plotting.py | 2 +- 4 files changed, 142 insertions(+), 36 deletions(-) diff --git a/README.md b/README.md index 7ebb8bf..4ad019f 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,8 @@ Camelot is a Python 2.7 library and command-line tool for extracting tabular dat ## Usage +### API +
 >>> import camelot
 >>> tables = camelot.read_pdf("foo.pdf")
@@ -23,6 +25,82 @@ Camelot is a Python 2.7 library and command-line tool for extracting tabular dat
 >>> df = tables[0].df
 
+### Command-line interface + +
+Usage: camelot [OPTIONS] FILEPATH
+
+Options:
+  -p, --pages TEXT                Comma-separated page numbers to parse.
+                                  Example: 1,3,4 or 1,4-end
+  -o, --output TEXT               Output filepath.
+  -f, --format [csv|json|excel|html]
+                                  Output file format.
+  -z, --zip                       Whether or not to create a ZIP archive.
+  -m, --mesh                      Whether or not to use Lattice method of
+                                  parsing. Stream is used by default.
+  -T, --table_area TEXT           Table areas (x1,y1,x2,y2) to process.
+                                  x1, y1 -> left-top and
+                                  x2, y2 -> right-bottom
+  -split, --split_text            Whether or not to split text if it spans
+                                  across multiple cells.
+  -flag, --flag_size              (inactive) Whether or not to flag text which
+                                  has uncommon size. (Useful to detect
+                                  super/subscripts)
+  -M, --margins ...
+                                  char_margin, line_margin, word_margin for
+                                  PDFMiner.
+  -C, --columns TEXT              x-coordinates of column separators.
+  -r, --row_close_tol INTEGER     Rows will be formed by combining text
+                                  vertically within this tolerance.
+  -c, --col_close_tol INTEGER     Columns will be formed by combining text
+                                  horizontally within this tolerance.
+  -back, --process_background     (with --mesh) Whether or not to process
+                                  lines that are in background.
+  -scale, --line_size_scaling INTEGER
+                                  (with --mesh) Factor by which the page
+                                  dimensions will be divided to get smallest
+                                  length of detected lines.
+  -copy, --copy_text [h|v]        (with --mesh) Specify direction in which
+                                  text will be copied over in a spanning cell.
+  -shift, --shift_text [l|r|t|b]  (with --mesh) Specify direction in which
+                                  text in a spanning cell should flow.
+  -l, --line_close_tol INTEGER    (with --mesh) Tolerance parameter used to
+                                  merge close vertical lines and close
+                                  horizontal lines.
+  -j, --joint_close_tol INTEGER   (with --mesh) Tolerance parameter used to
+                                  decide whether the detected lines and points
+                                  lie close to each other.
+  -block, --threshold_blocksize INTEGER
+                                  (with --mesh) For adaptive thresholding,
+                                  size of a pixel neighborhood that is used to
+                                  calculate a threshold value for the pixel:
+                                  3, 5, 7, and so on.
+  -const, --threshold_constant INTEGER
+                                  (with --mesh) For adaptive thresholding,
+                                  constant subtracted from the mean or
+                                  weighted mean.
+                                  Normally, it is positive but
+                                  may be zero or negative as well.
+  -I, --iterations INTEGER        (with --mesh) Number of times
+                                  erosion/dilation is applied.
+  -G, --geometry_type [text|table|contour|joint|line]
+                                  Plot geometry found on pdf page for
+                                  debugging.
+
+                                  text: Plot text objects. (Useful
+                                  to get table_area and columns coordinates)
+                                  table: Plot parsed table.
+                                  contour (with
+                                  --mesh): Plot detected rectangles.
+                                  joint
+                                  (with --mesh): Plot detected line
+                                  intersections.
+                                  line (with --mesh): Plot
+                                  detected lines.
+  --help                          Show this message and exit.
+
+ ## Dependencies The dependencies include [tk](https://wiki.tcl.tk/3743) and [ghostscript](https://www.ghostscript.com/). diff --git a/camelot/cli.py b/camelot/cli.py index 61cf3d8..709bfc3 100644 --- a/camelot/cli.py +++ b/camelot/cli.py @@ -17,44 +17,69 @@ class Mutex(click.Option): @click.command() -@click.option("-p", "--pages", default="1", help="") -@click.option("-o", "--output", help="") +@click.option("-p", "--pages", default="1", help="Comma-separated page numbers" + " to parse. Example: 1,3,4 or 1,4-end") +@click.option("-o", "--output", help="Output filepath.") @click.option("-f", "--format", - type=click.Choice(["csv", "json", "excel", "html"]), help="") -@click.option("-z", "--zip", is_flag=True, help="") + type=click.Choice(["csv", "json", "excel", "html"]), + help="Output file format.") +@click.option("-z", "--zip", is_flag=True, help="Whether or not to create a ZIP" + " archive.") @click.option("-m", "--mesh", is_flag=True, help="Whether or not to" - "use Lattice method of parsing. Stream is used by default.") + " use Lattice method of parsing. Stream is used by default.") +@click.option("-T", "--table_area", default=[], multiple=True, + help="Table areas (x1,y1,x2,y2) to process.\n" + " x1, y1 -> left-top and x2, y2 -> right-bottom") +@click.option("-split", "--split_text", is_flag=True, help="Whether or not to" + " split text if it spans across multiple cells.") +@click.option("-flag", "--flag_size", is_flag=True, help="(inactive) Whether or" + " not to flag text which has uncommon size. 
(Useful to detect" + " super/subscripts)") +@click.option("-M", "--margins", nargs=3, default=(1.0, 0.5, 0.1), + help="char_margin, line_margin, word_margin for PDFMiner.") +@click.option("-C", "--columns", default=[], multiple=True, cls=Mutex, + help="x-coordinates of column separators.") +@click.option("-r", "--row_close_tol", default=2, cls=Mutex, help="Rows will be" + " formed by combining text vertically within this tolerance.") +@click.option("-c", "--col_close_tol", default=0, cls=Mutex, help="Columns will" + " be formed by combining text horizontally within this tolerance.") +@click.option("-back", "--process_background", is_flag=True, cls=Mutex, + help="(with --mesh) Whether or not to process lines that are in" + " background.") +@click.option("-scale", "--line_size_scaling", default=15, cls=Mutex, + help="(with --mesh) Factor by which the page dimensions will be" + " divided to get smallest length of detected lines.") +@click.option("-copy", "--copy_text", default=[], type=click.Choice(["h", "v"]), + multiple=True, cls=Mutex, help="(with --mesh) Specify direction" + " in which text will be copied over in a spanning cell.") +@click.option("-shift", "--shift_text", default=["l", "t"], + type=click.Choice(["l", "r", "t", "b"]), multiple=True, cls=Mutex, + help="(with --mesh) Specify direction in which text in a spanning" + " cell should flow.") +@click.option("-l", "--line_close_tol", default=2, cls=Mutex, + help="(with --mesh) Tolerance parameter used to merge close vertical" + " lines and close horizontal lines.") +@click.option("-j", "--joint_close_tol", default=2, cls=Mutex, + help="(with --mesh) Tolerance parameter used to decide whether" + " the detected lines and points lie close to each other.") +@click.option("-block", "--threshold_blocksize", default=15, cls=Mutex, + help="(with --mesh) For adaptive thresholding, size of a pixel" + " neighborhood that is used to calculate a threshold value for" + " the pixel: 3, 5, 7, and so on.") 
+@click.option("-const", "--threshold_constant", default=-2, cls=Mutex, + help="(with --mesh) For adaptive thresholding, constant subtracted" + " from the mean or weighted mean.\nNormally, it is positive but" + " may be zero or negative as well.") +@click.option("-I", "--iterations", default=0, cls=Mutex, + help="(with --mesh) Number of times for erosion/dilation is" + " applied.") @click.option("-G", "--geometry_type", type=click.Choice(["text", "table", "contour", "joint", "line"]), - help="Plot geometry found on pdf page for debugging.") -@click.option("-T", "--table_area", default=[], multiple=True, - help="") -@click.option("-split", "--split_text", is_flag=True, help="") -@click.option("-flag", "--flag_size", is_flag=True, help="") -@click.option("-M", "--margins", nargs=3, default=(1.0, 0.5, 0.1), - help="") -@click.option("-C", "--columns", default=[], multiple=True, cls=Mutex, - help="") -@click.option("-r", "--row_close_tol", default=2, cls=Mutex, help="") -@click.option("-c", "--col_close_tol", default=0, cls=Mutex, help="") -@click.option("-back", "--process_background", is_flag=True, cls=Mutex, - help="Use with --mesh") -@click.option("-scale", "--line_size_scaling", default=15, cls=Mutex, - help="Use with --mesh") -@click.option("-copy", "--copy_text", default=[], cls=Mutex, - help="Use with --mesh") -@click.option("-shift", "--shift_text", default=["l", "t"], cls=Mutex, - help="Use with --mesh") -@click.option("-l", "--line_close_tol", default=2, cls=Mutex, - help="Use with --mesh") -@click.option("-j", "--joint_close_tol", default=2, cls=Mutex, - help="Use with --mesh") -@click.option("-block", "--threshold_blocksize", default=15, cls=Mutex, - help="Use with --mesh") -@click.option("-const", "--threshold_constant", default=-2, cls=Mutex, - help="Use with --mesh") -@click.option("-I", "--iterations", default=0, cls=Mutex, - help="Use with --mesh") + help="Plot geometry found on pdf page for debugging.\n\n" + "text: Plot text objects. 
(Useful to get table_area and" + " columns coordinates)\ntable: Plot parsed table.\n" + "contour (with --mesh): Plot detected rectangles.\njoint (with --mesh): Plot detected line" + " intersections.\nline (with --mesh): Plot detected lines.") @click.argument("filepath", type=click.Path(exists=True)) def cli(*args, **kwargs): pages = kwargs.pop("pages") @@ -69,6 +94,9 @@ def cli(*args, **kwargs): kwargs['table_area'] = None if not table_area else table_area columns = list(kwargs['columns']) kwargs['columns'] = None if not columns else columns + copy_text = list(kwargs['copy_text']) + kwargs['copy_text'] = None if not copy_text else copy_text + kwargs['shift_text'] = list(kwargs['shift_text']) kwargs = remove_extra(kwargs, mesh=mesh) if geometry_type is None: diff --git a/camelot/io.py b/camelot/io.py index a213cee..8297253 100644 --- a/camelot/io.py +++ b/camelot/io.py @@ -19,7 +19,7 @@ def read_pdf(filepath, pages='1', mesh=False, **kwargs): Whether or not to use Lattice method of parsing. Stream is used by default. table_area : list, optional (default: None) - List of table areas to analyze as strings of the form + List of table areas to process as strings of the form x1,y1,x2,y2 where (x1, y1) -> left-top and (x2, y2) -> right-bottom in pdf coordinate space. columns^ : list, optional (default: None) diff --git a/camelot/plotting.py b/camelot/plotting.py index 6012217..23757e3 100644 --- a/camelot/plotting.py +++ b/camelot/plotting.py @@ -32,7 +32,7 @@ def plot_geometry(filepath, pages='1', mesh=False, geometry_type=None, **kwargs) 'joint'* : Plot detected line intersections. 'line'* : Plot detected lines. table_area : list, optional (default: None) - List of table areas to analyze as strings of the form + List of table areas to process as strings of the form x1,y1,x2,y2 where (x1, y1) -> left-top and (x2, y2) -> right-bottom in pdf coordinate space. columns^ : list, optional (default: None)