diff --git a/.gitignore b/.gitignore index 4fd453c..bfa45c7 100644 --- a/.gitignore +++ b/.gitignore @@ -9,4 +9,3 @@ dist/ .pytest_cache/ _build/ -_static/ diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..7a29aed --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,3 @@ +Be cordial or be on your way. -- Kenneth Reitz + +https://www.kennethreitz.org/essays/be-cordial-or-be-on-your-way \ No newline at end of file diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..20940fd --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,125 @@ +# Contributor's Guide + +If you're reading this, you're probably looking to contribute to Camelot. *Time is the only real currency*, and the fact that you're considering spending some here is *very* generous of you. Thank you very much! + +This document will help you get started with contributing documentation, code, testing and filing issues. If you have any questions, feel free to reach out to [Vinayak Mehta](http://vinayak-mehta.github.io), the author and maintainer. + +## Code Of Conduct + +The following quote sums up the **Code Of Conduct**. + + **Be cordial or be on your way**. *--Kenneth Reitz* + +Kenneth Reitz has also written an [essay](https://www.kennethreitz.org/essays/be-cordial-or-be-on-your-way) on this topic, which you should read. + +As the [Requests Code Of Conduct](http://docs.python-requests.org/en/master/dev/contributing/#be-cordial) states, **all contributions are welcome**, as long as everyone involved is treated with respect. + +## Your First Contribution + +A great way to start contributing to Camelot is to pick an issue tagged with the [Contributor Friendly](https://github.com/socialcopsdev/camelot/labels/Contributor%20Friendly) tag or the [Level: Easy](https://github.com/socialcopsdev/camelot/labels/Level%3A%20Easy) tag. If you're unable to find a good first issue, feel free to contact the maintainer. 
+ +## Setting up a development environment + +To install the dependencies needed for development, you can use pip: + +
+$ pip install camelot-py[dev]
+
+ +## Pull Requests + +### Submit a Pull Request + +The preferred workflow for contributing to Camelot is to fork the [project repository](https://github.com/socialcopsdev/camelot) on GitHub, clone, develop on a branch and then finally submit a pull request. Steps: + +1. Fork the project repository: click on the ‘Fork’ button near the top of the page. This creates a copy of the code under your account on GitHub. + +2. Clone your fork of Camelot from your GitHub account: + +
+$ git clone https://www.github.com/[username]/camelot
+
+ +3. Create a branch to hold your changes: + +
+$ git checkout -b my-feature
+
+ +Always branch out from `master` to work on your contribution. It's good practice to never work on the `master` branch! + +**Protip: `git stash` is a great way to save the work that you haven't committed yet, to move between branches.** + +4. Work on your contribution. Add changed files using `git add` and then `git commit` them: + +
+$ git add modified_files
+$ git commit
+
+ +5. Finally, push them to your GitHub fork: + +
+$ git push -u origin my-feature
+
+ +Now it's time to go to your fork of Camelot and create a pull request! You can [follow these instructions](https://help.github.com/articles/creating-a-pull-request-from-a-fork/) to do the same. + +### Work on your Pull Request + +We recommend that your pull request complies with the following rules: + +- Make sure your code follows [pep8](http://pep8.org). + +- In case your pull request contains function docstrings, make sure you follow the [numpydoc](https://numpydoc.readthedocs.io/en/latest/format.html) format. All function docstrings in Camelot follow this format. Moreover, following the format will make sure that the API documentation is generated flawlessly. + +- Make sure your commit messages follow [the seven rules of a great git commit message](https://chris.beams.io/posts/git-commit/). + - Separate subject from body with a blank line + - Limit the subject line to 50 characters + - Capitalize the subject line + - Do not end the subject line with a period + - Use the imperative mood in the subject line + - Wrap the body at 72 characters + - Use the body to explain what and why vs. how + +- Please prefix the title of your pull request with [MRG] (Ready for Merge), if the contribution is complete and ready for a detailed review. An incomplete pull request's title should be prefixed with [WIP] (to indicate a work in progress), and changed to [MRG] when it's complete. A good [task list](https://blog.github.com/2013-01-09-task-lists-in-gfm-issues-pulls-comments/) in the PR description will ensure that other people will get a better idea of what it proposes to do, which will also increase collaboration. + +- If contributing new functionality, make sure that you add a unit test for it, while making sure that all previous tests pass. Camelot uses [pytest](https://docs.pytest.org/en/latest/) for testing. Tests can be run using: + +
+$ python setup.py test
+
+ +## Writing Documentation + +Writing documentation, function docstrings, examples and tutorials is a great way to start contributing to open-source software! The documentation is present inside the `docs/` directory of the source code repository. + +The documentation is written in [reStructuredText](https://en.wikipedia.org/wiki/ReStructuredText), with [Sphinx](http://www.sphinx-doc.org/en/master/) used to generate these lovely HTML files that you're currently reading (unless you're reading this on GitHub). You can edit the documentation using any text editor and then generate the HTML output by running `make html` in the `docs/` directory. + +The function docstrings are written using the [numpydoc](https://numpydoc.readthedocs.io/en/latest/format.html) extension for Sphinx. Make sure you check it out before you start writing one. + +## Filing Issues + +We use [GitHub issues](https://github.com/socialcopsdev/camelot/issues) to keep track of all issues and pull requests. Before opening an issue (which asks a question or reports a bug), it is advisable to use GitHub search to look for existing issues (both open and closed) that may be similar. + +### Questions + +Please don't use GitHub issues for support questions; a better place for them would be [Stack Overflow](http://stackoverflow.com). Make sure you tag them using the `python-camelot` tag. + +### Bug Reports + +- Please include your operating system type and Python version number, along with the version numbers of NumPy, OpenCV and Camelot. You can use the following code snippet to find this information: + +
+import platform; print(platform.platform())
+import sys; print('Python', sys.version)
+import numpy; print('NumPy', numpy.__version__)
+import cv2; print('OpenCV', cv2.__version__)
+import camelot; print('Camelot', camelot.__version__)
+
+ +- Please include the **complete traceback** in your bug report. + +- Make sure you include **steps to reproduce the bug**, using code snippets. See [Creating and highlighting code blocks](https://help.github.com/articles/creating-and-highlighting-code-blocks/). + +- Also include a link to the PDF document that you were trying to extract tables from, telling us what you expected the code to do and what actually happened. \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..1dc3072 --- /dev/null +++ b/LICENSE @@ -0,0 +1,7 @@ +Copyright (c) 2018 Peeply Private Ltd (Singapore) + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. \ No newline at end of file diff --git a/README.md b/README.md index 68630f0..efd5a4e 100644 --- a/README.md +++ b/README.md @@ -1,168 +1,97 @@ # Camelot: PDF Table Parsing for Humans -Camelot is a Python library and command-line tool for extracting tables from PDF files. 
+![license](https://img.shields.io/badge/license-MIT-lightgrey.svg) ![python-version](https://img.shields.io/badge/python-2.7-blue.svg) -## Usage +**Camelot** is a Python library which makes it easy for *anyone* to extract tables from PDF files! -### API +--- + +**Here's how you can extract tables from PDF files.** Check out the PDF used in this example, [here](docs/_static/pdf/foo.pdf).
 >>> import camelot
->>> tables = camelot.read_pdf("foo.pdf")
+>>> tables = camelot.read_pdf('foo.pdf', mesh=True)
 >>> tables
-<TableList n=2>
->>> tables.export("foo.csv", f="csv", compress=True) # json, excel, html
+<TableList tables=1>
+>>> tables.export('foo.csv', f='csv', compress=True) # json, excel, html
 >>> tables[0]
-<Table shape=(3,4)>
->>> tables[0].to_csv("foo.csv") # to_json, to_excel, to_html
+<Table shape=(7, 7)>
 >>> tables[0].parsing_report
 {
-    "accuracy": 96,
-    "whitespace": 80,
-    "order": 1,
-    "page": 1
+    'accuracy': 99.02,
+    'whitespace': 12.24,
+    'order': 1,
+    'page': 1
 }
->>> df = tables[0].df
+>>> tables[0].to_csv('foo.csv') # to_json, to_excel, to_html
+>>> tables[0].df # get a pandas DataFrame!
 
-### Command-line interface +| Cycle Name | KI (1/km) | Distance (mi) | Percent Fuel Savings | | | | +|------------|-----------|---------------|----------------------|-----------------|-----------------|----------------| +| | | | Improved Speed | Decreased Accel | Eliminate Stops | Decreased Idle | +| 2012_2 | 3.30 | 1.3 | 5.9% | 9.5% | 29.2% | 17.4% | +| 2145_1 | 0.68 | 11.2 | 2.4% | 0.1% | 9.5% | 2.7% | +| 4234_1 | 0.59 | 58.7 | 8.5% | 1.3% | 8.5% | 3.3% | +| 2032_2 | 0.17 | 57.8 | 21.7% | 0.3% | 2.7% | 1.2% | +| 4171_1 | 0.07 | 173.9 | 58.1% | 1.6% | 2.1% | 0.5% | -
-Usage: camelot [OPTIONS] FILEPATH
+There's a [command-line interface]() too!
 
-Options:
-  -p, --pages TEXT                Comma-separated page numbers to parse.
-                                  Example: 1,3,4 or 1,4-end
-  -o, --output TEXT               Output filepath.
-  -f, --format [csv|json|excel|html]
-                                  Output file format.
-  -z, --zip                       Whether or not to create a ZIP archive.
-  -m, --mesh                      Whether or not to use Lattice method of
-                                  parsing. Stream is used by default.
-  -T, --table_area TEXT           Table areas (x1,y1,x2,y2) to process.
-                                  x1, y1
-                                  -> left-top and x2, y2 -> right-bottom
-  -split, --split_text            Whether or not to split text if it spans
-                                  across multiple cells.
-  -flag, --flag_size              (inactive) Whether or not to flag text which
-                                  has uncommon size. (Useful to detect
-                                  super/subscripts)
-  -M, --margins <FLOAT FLOAT FLOAT>...
-                                  char_margin, line_margin, word_margin for
-                                  PDFMiner.
-  -C, --columns TEXT              x-coordinates of column separators.
-  -r, --row_close_tol INTEGER     Rows will be formed by combining text
-                                  vertically within this tolerance.
-  -c, --col_close_tol INTEGER     Columns will be formed by combining text
-                                  horizontally within this tolerance.
-  -back, --process_background     (with --mesh) Whether or not to process
-                                  lines that are in background.
-  -scale, --line_size_scaling INTEGER
-                                  (with --mesh) Factor by which the page
-                                  dimensions will be divided to get smallest
-                                  length of detected lines.
-  -copy, --copy_text [h|v]        (with --mesh) Specify direction in which
-                                  text will be copied over in a spanning cell.
-  -shift, --shift_text [l|r|t|b]  (with --mesh) Specify direction in which
-                                  text in a spanning cell should flow.
-  -l, --line_close_tol INTEGER    (with --mesh) Tolerance parameter used to
-                                  merge close vertical lines and close
-                                  horizontal lines.
-  -j, --joint_close_tol INTEGER   (with --mesh) Tolerance parameter used to
-                                  decide whether the detected lines and points
-                                  lie close to each other.
-  -block, --threshold_blocksize INTEGER
-                                  (with --mesh) For adaptive thresholding,
-                                  size of a pixel neighborhood that is used to
-                                  calculate a threshold value for the pixel:
-                                  3, 5, 7, and so on.
-  -const, --threshold_constant INTEGER
-                                  (with --mesh) For adaptive thresholding,
-                                  constant subtracted from the mean or
-                                  weighted mean.
-                                  Normally, it is positive but
-                                  may be zero or negative as well.
-  -I, --iterations INTEGER        (with --mesh) Number of times for
-                                  erosion/dilation is applied.
-  -G, --geometry_type [text|table|contour|joint|line]
-                                  Plot geometry found on pdf page for
-                                  debugging.
+## Why Camelot?
 
-                                  text: Plot text objects. (Useful
-                                  to get table_area and columns coordinates)
-                                  table: Plot parsed table.
-                                  contour (with
-                                  --mesh): Plot detected rectangles.
-                                  joint
-                                  (with --mesh): Plot detected line
-                                  intersections.
-                                  line (with --mesh): Plot
-                                  detected lines.
-  --help                          Show this message and exit.
-
+- **You are in control**: Unlike other libraries and tools which either give a nice output or fail miserably (with no in-between), Camelot gives you the power to tweak table extraction. (Since everything in the real world, including PDF table extraction, is fuzzy.) +- **Metrics**: *Bad* tables can be discarded based on metrics like accuracy and whitespace, without ever having to manually look at each table. +- Each table is a **pandas DataFrame**, which enables seamless integration into data analysis workflows. +- **Export** to multiple formats, including json, excel and html. +- Simple and Elegant API, written in **Python**! -## Dependencies - -The dependencies include [tk](https://wiki.tcl.tk/3743) and [ghostscript](https://www.ghostscript.com/). +See [comparison with other PDF parsing libraries and tools](https://github.com/socialcopsdev/camelot/wiki/Comparison-with-other-PDF-Table-Parsing-libraries-and-tools). ## Installation -Make sure you have the most updated versions for `pip` and `setuptools`. You can update them by +After [installing the dependencies](), you can simply use pip to install Camelot:
-pip install -U pip setuptools
+$ pip install camelot-py
 
-### Installing dependencies +## Documentation -tk and ghostscript can be installed using your system's default package manager. - -#### Linux - -* Ubuntu - -
-sudo apt-get install python-tk ghostscript
-
- -* Arch Linux - -
-sudo pacman -S tk ghostscript
-
- -#### OS X - -
-brew install tcl-tk ghostscript
-
- -Finally, `cd` into the project directory and install by - -
-python setup.py install
-
+Great documentation is available at [link](). ## Development -### Code +The [Contributor's Guide](CONTRIBUTING.md) has detailed information about contributing code, documentation, tests and more. We've included some basic information in this README. -You can check the latest sources with the command: +### Source code + +You can check the latest sources with:
-git clone https://github.com/socialcopsdev/camelot.git
+$ git clone https://www.github.com/socialcopsdev/camelot
 
-### Contributing +### Setting up a development environment -See [Contributing guidelines](). +You can install the development dependencies easily, using pip: + +
+$ pip install camelot-py[dev]
+
### Testing +After installation, you can run tests using: +
-python setup.py test
+$ python setup.py test
 
+## Versioning + +Camelot uses [Semantic Versioning](https://semver.org/). For the available versions, see the tags on this repository. + ## License -BSD License +This project is licensed under the MIT License, see the [LICENSE](LICENSE) file for details. \ No newline at end of file diff --git a/camelot/cli.py b/camelot/cli.py index 709bfc3..98bb681 100644 --- a/camelot/cli.py +++ b/camelot/cli.py @@ -3,6 +3,7 @@ from pprint import pprint import click +from . import __version__ from .io import read_pdf from .plotting import plot_geometry from .utils import validate_input, remove_extra @@ -17,6 +18,7 @@ class Mutex(click.Option): @click.command() +@click.version_option(version=__version__) @click.option("-p", "--pages", default="1", help="Comma-separated page numbers" " to parse. Example: 1,3,4 or 1,4-end") @click.option("-o", "--output", help="Output filepath.") @@ -53,7 +55,7 @@ class Mutex(click.Option): multiple=True, cls=Mutex, help="(with --mesh) Specify direction" " in which text will be copied over in a spanning cell.") @click.option("-shift", "--shift_text", default=["l", "t"], - type=click.Choice(["l", "r", "t", "b"]), multiple=True, cls=Mutex, + type=click.Choice(["", "l", "r", "t", "b"]), multiple=True, cls=Mutex, help="(with --mesh) Specify direction in which text in a spanning" " cell should flow.") @click.option("-l", "--line_close_tol", default=2, cls=Mutex, diff --git a/camelot/core.py b/camelot/core.py index e09dc1e..22b7442 100644 --- a/camelot/core.py +++ b/camelot/core.py @@ -46,7 +46,6 @@ class Cell(object): Whether or not cell spans vertically. text : string Text assigned to cell. - bound """ @@ -101,8 +100,7 @@ class Table(object): Attributes ---------- - df : object - pandas.DataFrame + df : :class:`pandas.DataFrame` shape : tuple Shape of the table. accuracy : float @@ -113,8 +111,6 @@ class Table(object): Table number on pdf page. page : int Pdf page number. 
- data - parsing_report """ def __init__(self, cols, rows): @@ -143,13 +139,13 @@ class Table(object): @property def parsing_report(self): - """Returns a parsing report with accuracy, %whitespace, + """Returns a parsing report with %accuracy, %whitespace, table number on page and page number. """ # pretty? report = { - 'accuracy': self.accuracy, - 'whitespace': self.whitespace, + 'accuracy': round(self.accuracy, 2), + 'whitespace': round(self.whitespace, 2), 'order': self.order, 'page': self.page } @@ -317,27 +313,41 @@ class Table(object): cell.vspan = True elif top and bottom and (not left and not right): cell.hspan = True + elif cell.bound in [0, 1]: + cell.vspan = True + cell.hspan = True return self def to_csv(self, path, **kwargs): - """Write Table to a comma-separated values (csv) file. + """Writes Table to a comma-separated values (csv) file. + + For kwargs, check :meth:`pandas.DataFrame.to_csv`. + + Parameters + ---------- + path : str + Output filepath. - Check `pandas.DataFrame.to_csv `_ - kwargs for more details around what kwargs to use. """ kw = { 'encoding': 'utf-8', 'index': False, + 'header': False, 'quoting': 1 } kw.update(kwargs) self.df.to_csv(path, **kw) def to_json(self, path, **kwargs): - """Write Table to a JSON file. + """Writes Table to a JSON file. + + For kwargs, check :meth:`pandas.DataFrame.to_json`. + + Parameters + ---------- + path : str + Output filepath. - Check `pandas.DataFrame.to_json `_ - kwargs for more details around what kwargs to use. """ kw = { 'orient': 'records' @@ -348,10 +358,15 @@ class Table(object): f.write(json_string) def to_excel(self, path, **kwargs): - """Write Table to an Excel file. + """Writes Table to an Excel file. + + For kwargs, check :meth:`pandas.DataFrame.to_excel`. + + Parameters + ---------- + path : str + Output filepath. - Check `pandas.DataFrame.to_excel `_ - kwargs for more details around what kwargs to use. 
""" kw = { 'sheet_name': 'page-{}-table-{}'.format(self.page, self.order), @@ -363,10 +378,15 @@ class Table(object): writer.save() def to_html(self, path, **kwargs): - """Write Table to an HTML file. + """Writes Table to an HTML file. + + For kwargs, check :meth:`pandas.DataFrame.to_html`. + + Parameters + ---------- + path : str + Output filepath. - Check `pandas.DataFrame.to_html `_ - kwargs for more details around what kwargs to use. """ html_string = self.df.to_html(**kwargs) with open(path, 'w') as f: @@ -434,7 +454,7 @@ class TableList(object): Parameters ---------- path : str - Filepath + Output filepath. f : str File format. Can be csv, json, excel and html. compress : bool diff --git a/camelot/handlers.py b/camelot/handlers.py index 516cc3b..59b31c3 100644 --- a/camelot/handlers.py +++ b/camelot/handlers.py @@ -13,8 +13,8 @@ class PDFHandler(object): file into single page pdfs, parsing each pdf and then removing the temp directory. - Parameter - --------- + Parameters + ---------- filename : str Path to pdf file. pages : str @@ -81,6 +81,8 @@ class PDFHandler(object): """ with open(filename, 'rb') as fileobj: infile = PdfFileReader(fileobj, strict=False) + if infile.isEncrypted: + infile.decrypt('') fpath = os.path.join(temp, 'page-{0}.pdf'.format(page)) froot, fext = os.path.splitext(fpath) p = infile.getPage(page - 1) @@ -98,6 +100,8 @@ class PDFHandler(object): fpath_new = ''.join([froot.replace('page', 'p'), '_rotated', fext]) os.rename(fpath, fpath_new) infile = PdfFileReader(open(fpath_new, 'rb'), strict=False) + if infile.isEncrypted: + infile.decrypt('') outfile = PdfFileWriter() p = infile.getPage(0) if rotation == 'anticlockwise': diff --git a/camelot/io.py b/camelot/io.py index 8297253..328b107 100644 --- a/camelot/io.py +++ b/camelot/io.py @@ -30,8 +30,8 @@ def read_pdf(filepath, pages='1', mesh=False, **kwargs): multiple cells. 
flag_size : bool, optional (default: False) Whether or not to highlight a substring using - if its size is different from rest of the string, useful for - super and subscripts. + if its size is different from rest of the string. (Useful for + super and subscripts) row_close_tol^ : int, optional (default: 2) Rows will be formed by combining text vertically within this tolerance. @@ -61,24 +61,24 @@ def read_pdf(filepath, pages='1', mesh=False, **kwargs): joint_close_tol* : int, optional (default: 2) Tolerance parameter used to decide whether the detected lines and points lie close to each other. - threshold_blocksize : int, optional (default: 15) + threshold_blocksize* : int, optional (default: 15) Size of a pixel neighborhood that is used to calculate a threshold value for the pixel: 3, 5, 7, and so on. For more information, refer `OpenCV's adaptiveThreshold `_. - threshold_constant : int, optional (default: -2) + threshold_constant* : int, optional (default: -2) Constant subtracted from the mean or weighted mean. Normally, it is positive but may be zero or negative as well. For more information, refer `OpenCV's adaptiveThreshold `_. - iterations : int, optional (default: 0) + iterations* : int, optional (default: 0) Number of times for erosion/dilation is applied. For more information, refer `OpenCV's dilate `_. margins : tuple PDFMiner margins. (char_margin, line_margin, word_margin) - For for information, refer `PDFMiner docs `_. + For more information, refer `PDFMiner docs `_. Returns ------- diff --git a/camelot/parsers/lattice.py b/camelot/parsers/lattice.py index 40a9040..9e569ab 100644 --- a/camelot/parsers/lattice.py +++ b/camelot/parsers/lattice.py @@ -51,8 +51,8 @@ class Lattice(BaseParser): multiple cells. flag_size : bool, optional (default: False) Whether or not to highlight a substring using - if its size is different from rest of the string, useful for - super and subscripts. + if its size is different from rest of the string. 
(Useful for + super and subscripts) line_close_tol : int, optional (default: 2) Tolerance parameter used to merge vertical and horizontal detected lines which lie close to each other. @@ -76,7 +76,7 @@ class Lattice(BaseParser): margins : tuple PDFMiner margins. (char_margin, line_margin, word_margin) - For for information, refer `PDFMiner docs `_. + For more information, refer `PDFMiner docs `_. debug : bool, optional (default: False) Whether or not to return all text objects on the page which can be used to generate a matplotlib plot, to get diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py index f547bf0..6d29a05 100644 --- a/camelot/parsers/stream.py +++ b/camelot/parsers/stream.py @@ -35,8 +35,8 @@ class Stream(BaseParser): multiple cells. flag_size : bool, optional (default: False) Whether or not to highlight a substring using - if its size is different from rest of the string, useful for - super and subscripts. + if its size is different from rest of the string. (Useful for + super and subscripts) row_close_tol : int, optional (default: 2) Rows will be formed by combining text vertically within this tolerance. @@ -46,7 +46,7 @@ class Stream(BaseParser): margins : tuple, optional (default: (1.0, 0.5, 0.1)) PDFMiner margins. (char_margin, line_margin, word_margin) - For for information, refer `PDFMiner docs `_. + For more information, refer `PDFMiner docs `_. 
debug : bool, optional (default: False) Whether or not to return all text objects on the page which can be used to generate a matplotlib plot, to get @@ -294,8 +294,7 @@ class Stream(BaseParser): if ncols == 1: logger.info("No tables found on {}".format( os.path.basename(self.rootname))) - cols = [(t.x0, t.x1) - for r in rows_grouped if len(r) == ncols for t in r] + cols = [(t.x0, t.x1) for r in rows_grouped if len(r) == ncols for t in r] cols = self._merge_columns(sorted(cols), col_close_tol=self.col_close_tol) inner_text = [] for i in range(1, len(cols)): diff --git a/camelot/plotting.py b/camelot/plotting.py index 23757e3..7a94b53 100644 --- a/camelot/plotting.py +++ b/camelot/plotting.py @@ -25,12 +25,12 @@ def plot_geometry(filepath, pages='1', mesh=False, geometry_type=None, **kwargs) Whether or not to use Lattice method of parsing. Stream is used by default. geometry_type : str, optional (default: None) - 'text' : Plot text objects found on page, useful to get - table_area and columns coordinates. - 'table' : Plot parsed table. - 'contour'* : Plot detected rectangles. - 'joint'* : Plot detected line intersections. - 'line'* : Plot detected lines. + * 'text' : Plot text objects found on page. (Useful to get \ + table_area and columns coordinates) + * 'table' : Plot parsed table. + * 'contour'* : Plot detected rectangles. + * 'joint'* : Plot detected line intersections. + * 'line'* : Plot detected lines. table_area : list, optional (default: None) List of table areas to process as strings of the form x1,y1,x2,y2 where (x1, y1) -> left-top and @@ -43,8 +43,8 @@ def plot_geometry(filepath, pages='1', mesh=False, geometry_type=None, **kwargs) multiple cells. flag_size : bool, optional (default: False) Whether or not to highlight a substring using - if its size is different from rest of the string, useful for - super and subscripts. + if its size is different from rest of the string. (Useful for + super and subscripts.) 
row_close_tol^ : int, optional (default: 2) Rows will be formed by combining text vertically within this tolerance. @@ -74,24 +74,24 @@ def plot_geometry(filepath, pages='1', mesh=False, geometry_type=None, **kwargs) joint_close_tol* : int, optional (default: 2) Tolerance parameter used to decide whether the detected lines and points lie close to each other. - threshold_blocksize : int, optional (default: 15) + threshold_blocksize* : int, optional (default: 15) Size of a pixel neighborhood that is used to calculate a threshold value for the pixel: 3, 5, 7, and so on. For more information, refer `OpenCV's adaptiveThreshold `_. - threshold_constant : int, optional (default: -2) + threshold_constant* : int, optional (default: -2) Constant subtracted from the mean or weighted mean. Normally, it is positive but may be zero or negative as well. For more information, refer `OpenCV's adaptiveThreshold `_. - iterations : int, optional (default: 0) + iterations* : int, optional (default: 0) Number of times for erosion/dilation is applied. For more information, refer `OpenCV's dilate `_. margins : tuple PDFMiner margins. (char_margin, line_margin, word_margin) - For for information, refer `PDFMiner docs `_. + For more information, refer `PDFMiner docs `_. """ validate_input(kwargs, mesh=mesh, geometry_type=geometry_type) @@ -141,7 +141,7 @@ def plot_geometry(filepath, pages='1', mesh=False, geometry_type=None, **kwargs) for img, table_bbox in geometry.images: for t in table_bbox.keys(): cv2.rectangle(img, (t[0], t[1]), - (t[2], t[3]), (255, 0, 0), 3) + (t[2], t[3]), (255, 0, 0), 20) plt.imshow(img) plt.show() elif geometry_type == 'joint': diff --git a/camelot/utils.py b/camelot/utils.py index 815f87d..c0f4a59 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -454,8 +454,8 @@ def split_textline(table, textline, direction, flag_size=False): Direction of the PDFMiner LTTextLine object. 
flag_size : bool, optional (default: False) Whether or not to highlight a substring using - if its size is different from rest of the string, useful for - super and subscripts. + if its size is different from rest of the string. (Useful for + super and subscripts.) Returns ------- @@ -530,8 +530,8 @@ def get_table_index(table, t, direction, split_text=False, flag_size=False): multiple cells. flag_size : bool, optional (default: False) Whether or not to highlight a substring using - if its size is different from rest of the string, useful for - super and subscripts. + if its size is different from rest of the string. (Useful for + super and subscripts) Returns ------- diff --git a/docs/assets/camelot.png b/docs/_static/camelot.png similarity index 100% rename from docs/assets/camelot.png rename to docs/_static/camelot.png diff --git a/docs/_static/csv/background_lines.csv b/docs/_static/csv/background_lines.csv new file mode 100755 index 0000000..274bd62 --- /dev/null +++ b/docs/_static/csv/background_lines.csv @@ -0,0 +1,8 @@ +"State","Date","Halt stations","Halt days","Persons directly reached(in lakh)","Persons trained","Persons counseled","Persons testedfor HIV" +"Delhi","1.12.2009","8","17","1.29","3,665","2,409","1,000" +"Rajasthan","2.12.2009 to 19.12.2009","","","","","","" +"Gujarat","20.12.2009 to 3.1.2010","6","13","6.03","3,810","2,317","1,453" +"Maharashtra","4.01.2010 to 1.2.2010","13","26","1.27","5,680","9,027","4,153" +"Karnataka","2.2.2010 to 22.2.2010","11","19","1.80","5,741","3,658","3,183" +"Kerala","23.2.2010 to 11.3.2010","9","17","1.42","3,559","2,173","855" +"Total","","47","92","11.81","22,455","19,584","10,644" diff --git a/docs/_static/csv/foo.csv b/docs/_static/csv/foo.csv new file mode 100644 index 0000000..8e956a3 --- /dev/null +++ b/docs/_static/csv/foo.csv @@ -0,0 +1,7 @@ +"Cycle Name","KI (1/km)","Distance (mi)","Percent Fuel Savings","","","" +"","","","Improved Speed","Decreased Accel","Eliminate Stops","Decreased Idle" 
+"2012_2","3.30","1.3","5.9%","9.5%","29.2%","17.4%" +"2145_1","0.68","11.2","2.4%","0.1%","9.5%","2.7%" +"4234_1","0.59","58.7","8.5%","1.3%","8.5%","3.3%" +"2032_2","0.17","57.8","21.7%","0.3%","2.7%","1.2%" +"4171_1","0.07","173.9","58.1%","1.6%","2.1%","0.5%" \ No newline at end of file diff --git a/docs/_static/csv/table_areas.csv b/docs/_static/csv/table_areas.csv new file mode 100755 index 0000000..13b1dbc --- /dev/null +++ b/docs/_static/csv/table_areas.csv @@ -0,0 +1,11 @@ +"","One Withholding" +"Payroll Period","Allowance" +"Weekly","$71.15" +"Biweekly","142.31" +"Semimonthly","154.17" +"Monthly","308.33" +"Quarterly","925.00" +"Semiannually","1,850.00" +"Annually","3,700.00" +"Daily or Miscellaneous","14.23" +"(each day of the payroll period)","" diff --git a/docs/assets/favicon.ico b/docs/_static/favicon.ico similarity index 100% rename from docs/assets/favicon.ico rename to docs/_static/favicon.ico diff --git a/docs/_static/pdf/background_lines.pdf b/docs/_static/pdf/background_lines.pdf new file mode 100755 index 0000000..f23d6b7 Binary files /dev/null and b/docs/_static/pdf/background_lines.pdf differ diff --git a/docs/_static/pdf/column_separators.pdf b/docs/_static/pdf/column_separators.pdf new file mode 100755 index 0000000..cecd7b6 Binary files /dev/null and b/docs/_static/pdf/column_separators.pdf differ diff --git a/docs/_static/pdf/copy_text.pdf b/docs/_static/pdf/copy_text.pdf new file mode 100644 index 0000000..39bce84 Binary files /dev/null and b/docs/_static/pdf/copy_text.pdf differ diff --git a/docs/_static/pdf/foo.pdf b/docs/_static/pdf/foo.pdf new file mode 100644 index 0000000..742e018 Binary files /dev/null and b/docs/_static/pdf/foo.pdf differ diff --git a/docs/_static/pdf/group_rows.pdf b/docs/_static/pdf/group_rows.pdf new file mode 100755 index 0000000..46cd236 Binary files /dev/null and b/docs/_static/pdf/group_rows.pdf differ diff --git a/docs/_static/pdf/rotated.pdf b/docs/_static/pdf/rotated.pdf new file mode 100755 index 
0000000..8b7a615 Binary files /dev/null and b/docs/_static/pdf/rotated.pdf differ diff --git a/docs/_static/pdf/short_lines.pdf b/docs/_static/pdf/short_lines.pdf new file mode 100755 index 0000000..5cab903 Binary files /dev/null and b/docs/_static/pdf/short_lines.pdf differ diff --git a/docs/_static/pdf/superscript.pdf b/docs/_static/pdf/superscript.pdf new file mode 100755 index 0000000..855a3bd Binary files /dev/null and b/docs/_static/pdf/superscript.pdf differ diff --git a/docs/_static/pdf/table_areas.pdf b/docs/_static/pdf/table_areas.pdf new file mode 100755 index 0000000..45b3de3 Binary files /dev/null and b/docs/_static/pdf/table_areas.pdf differ diff --git a/docs/_static/png/background_lines.png b/docs/_static/png/background_lines.png new file mode 100755 index 0000000..5165312 Binary files /dev/null and b/docs/_static/png/background_lines.png differ diff --git a/docs/assets/columns.png b/docs/_static/png/columns.png similarity index 100% rename from docs/assets/columns.png rename to docs/_static/png/columns.png diff --git a/docs/assets/contour.png b/docs/_static/png/geometry_contour.png similarity index 100% rename from docs/assets/contour.png rename to docs/_static/png/geometry_contour.png diff --git a/docs/assets/intersection.png b/docs/_static/png/geometry_joint.png similarity index 100% rename from docs/assets/intersection.png rename to docs/_static/png/geometry_joint.png diff --git a/docs/assets/line.png b/docs/_static/png/geometry_line.png similarity index 100% rename from docs/assets/line.png rename to docs/_static/png/geometry_line.png diff --git a/docs/assets/table_span.png b/docs/_static/png/geometry_table.png similarity index 100% rename from docs/assets/table_span.png rename to docs/_static/png/geometry_table.png diff --git a/docs/_static/png/geometry_text.png b/docs/_static/png/geometry_text.png new file mode 100755 index 0000000..06dd575 Binary files /dev/null and b/docs/_static/png/geometry_text.png differ diff --git 
a/docs/assets/lattice.png b/docs/_static/png/lattice.png similarity index 100% rename from docs/assets/lattice.png rename to docs/_static/png/lattice.png diff --git a/docs/assets/lattice_all.png b/docs/_static/png/lattice_all.png similarity index 100% rename from docs/assets/lattice_all.png rename to docs/_static/png/lattice_all.png diff --git a/docs/assets/lattice_all_ex.png b/docs/_static/png/lattice_all_ex.png similarity index 100% rename from docs/assets/lattice_all_ex.png rename to docs/_static/png/lattice_all_ex.png diff --git a/docs/assets/lattice_rc.png b/docs/_static/png/lattice_rc.png similarity index 100% rename from docs/assets/lattice_rc.png rename to docs/_static/png/lattice_rc.png diff --git a/docs/assets/lattice_rc_ex.png b/docs/_static/png/lattice_rc_ex.png similarity index 100% rename from docs/assets/lattice_rc_ex.png rename to docs/_static/png/lattice_rc_ex.png diff --git a/docs/_static/png/short_lines.png b/docs/_static/png/short_lines.png new file mode 100755 index 0000000..395e834 Binary files /dev/null and b/docs/_static/png/short_lines.png differ diff --git a/docs/_static/png/short_lines_1.png b/docs/_static/png/short_lines_1.png new file mode 100644 index 0000000..adbcf4f Binary files /dev/null and b/docs/_static/png/short_lines_1.png differ diff --git a/docs/_static/png/short_lines_2.png b/docs/_static/png/short_lines_2.png new file mode 100755 index 0000000..8eed12f Binary files /dev/null and b/docs/_static/png/short_lines_2.png differ diff --git a/docs/assets/stream1.png b/docs/_static/png/stream1.png similarity index 100% rename from docs/assets/stream1.png rename to docs/_static/png/stream1.png diff --git a/docs/assets/stream1_all.png b/docs/_static/png/stream1_all.png similarity index 100% rename from docs/assets/stream1_all.png rename to docs/_static/png/stream1_all.png diff --git a/docs/assets/stream1_page.png b/docs/_static/png/stream1_page.png similarity index 100% rename from docs/assets/stream1_page.png rename to 
docs/_static/png/stream1_page.png diff --git a/docs/assets/stream1_page_y.png b/docs/_static/png/stream1_page_y.png similarity index 100% rename from docs/assets/stream1_page_y.png rename to docs/_static/png/stream1_page_y.png diff --git a/docs/assets/stream1_rc.png b/docs/_static/png/stream1_rc.png similarity index 100% rename from docs/assets/stream1_rc.png rename to docs/_static/png/stream1_rc.png diff --git a/docs/assets/stream2.png b/docs/_static/png/stream2.png similarity index 100% rename from docs/assets/stream2.png rename to docs/_static/png/stream2.png diff --git a/docs/assets/stream2_all.png b/docs/_static/png/stream2_all.png similarity index 100% rename from docs/assets/stream2_all.png rename to docs/_static/png/stream2_all.png diff --git a/docs/assets/stream2_page.png b/docs/_static/png/stream2_page.png similarity index 100% rename from docs/assets/stream2_page.png rename to docs/_static/png/stream2_page.png diff --git a/docs/assets/stream2_page_y10_m8.png b/docs/_static/png/stream2_page_y10_m8.png similarity index 100% rename from docs/assets/stream2_page_y10_m8.png rename to docs/_static/png/stream2_page_y10_m8.png diff --git a/docs/assets/stream2_rc.png b/docs/_static/png/stream2_rc.png similarity index 100% rename from docs/assets/stream2_rc.png rename to docs/_static/png/stream2_rc.png diff --git a/docs/_static/png/superscript.png b/docs/_static/png/superscript.png new file mode 100755 index 0000000..d798aa7 Binary files /dev/null and b/docs/_static/png/superscript.png differ diff --git a/docs/assets/table.png b/docs/_static/png/table.png similarity index 100% rename from docs/assets/table.png rename to docs/_static/png/table.png diff --git a/docs/_templates/hacks.html b/docs/_templates/hacks.html new file mode 100644 index 0000000..90575ec --- /dev/null +++ b/docs/_templates/hacks.html @@ -0,0 +1,16 @@ + \ No newline at end of file diff --git a/docs/_templates/sidebarintro.html b/docs/_templates/sidebarintro.html new file mode 100644 index 
0000000..111ac69 --- /dev/null +++ b/docs/_templates/sidebarintro.html @@ -0,0 +1,16 @@ + +

+ +

+ +

Useful Links

+ \ No newline at end of file diff --git a/docs/_templates/sidebarlogo.html b/docs/_templates/sidebarlogo.html new file mode 100644 index 0000000..072c269 --- /dev/null +++ b/docs/_templates/sidebarlogo.html @@ -0,0 +1,9 @@ + +

+ +

\ No newline at end of file diff --git a/docs/_themes/.gitignore b/docs/_themes/.gitignore new file mode 100644 index 0000000..3072e6f --- /dev/null +++ b/docs/_themes/.gitignore @@ -0,0 +1,2 @@ +*.pyc +*.pyo \ No newline at end of file diff --git a/docs/_themes/LICENSE b/docs/_themes/LICENSE new file mode 100644 index 0000000..8756c7a --- /dev/null +++ b/docs/_themes/LICENSE @@ -0,0 +1,37 @@ +Copyright (c) 2010 by Armin Ronacher. + +Some rights reserved. + +Redistribution and use in source and binary forms of the theme, with or +without modification, are permitted provided that the following conditions +are met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + +* The names of the contributors may not be used to endorse or + promote products derived from this software without specific + prior written permission. + +We kindly ask you to only use these themes in an unmodified manner just +for Flask and Flask-related products, not for unrelated projects. If you +like the visual style and want to use it for your own projects, please +consider making some larger changes to the themes (such as changing +font faces, sizes, colors or margins). + +THIS THEME IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS THEME, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/docs/_themes/flask_theme_support.py b/docs/_themes/flask_theme_support.py new file mode 100644 index 0000000..8a3278e --- /dev/null +++ b/docs/_themes/flask_theme_support.py @@ -0,0 +1,86 @@ +# flasky pygments style based on tango style +from pygments.style import Style +from pygments.token import Keyword, Name, Comment, String, Error, \ + Number, Operator, Generic, Whitespace, Punctuation, Other, Literal + + +class FlaskyStyle(Style): + background_color = "#f8f8f8" + default_style = "" + + styles = { + # No corresponding class for the following: + #Text: "", # class: '' + Whitespace: "underline #f8f8f8", # class: 'w' + Error: "#a40000 border:#ef2929", # class: 'err' + Other: "#000000", # class 'x' + + Comment: "italic #8f5902", # class: 'c' + Comment.Preproc: "noitalic", # class: 'cp' + + Keyword: "bold #004461", # class: 'k' + Keyword.Constant: "bold #004461", # class: 'kc' + Keyword.Declaration: "bold #004461", # class: 'kd' + Keyword.Namespace: "bold #004461", # class: 'kn' + Keyword.Pseudo: "bold #004461", # class: 'kp' + Keyword.Reserved: "bold #004461", # class: 'kr' + Keyword.Type: "bold #004461", # class: 'kt' + + Operator: "#582800", # class: 'o' + Operator.Word: "bold #004461", # class: 'ow' - like keywords + + Punctuation: "bold #000000", # class: 'p' + + # because special names such as Name.Class, Name.Function, etc. 
+ # are not recognized as such later in the parsing, we choose them + # to look the same as ordinary variables. + Name: "#000000", # class: 'n' + Name.Attribute: "#c4a000", # class: 'na' - to be revised + Name.Builtin: "#004461", # class: 'nb' + Name.Builtin.Pseudo: "#3465a4", # class: 'bp' + Name.Class: "#000000", # class: 'nc' - to be revised + Name.Constant: "#000000", # class: 'no' - to be revised + Name.Decorator: "#888", # class: 'nd' - to be revised + Name.Entity: "#ce5c00", # class: 'ni' + Name.Exception: "bold #cc0000", # class: 'ne' + Name.Function: "#000000", # class: 'nf' + Name.Property: "#000000", # class: 'py' + Name.Label: "#f57900", # class: 'nl' + Name.Namespace: "#000000", # class: 'nn' - to be revised + Name.Other: "#000000", # class: 'nx' + Name.Tag: "bold #004461", # class: 'nt' - like a keyword + Name.Variable: "#000000", # class: 'nv' - to be revised + Name.Variable.Class: "#000000", # class: 'vc' - to be revised + Name.Variable.Global: "#000000", # class: 'vg' - to be revised + Name.Variable.Instance: "#000000", # class: 'vi' - to be revised + + Number: "#990000", # class: 'm' + + Literal: "#000000", # class: 'l' + Literal.Date: "#000000", # class: 'ld' + + String: "#4e9a06", # class: 's' + String.Backtick: "#4e9a06", # class: 'sb' + String.Char: "#4e9a06", # class: 'sc' + String.Doc: "italic #8f5902", # class: 'sd' - like a comment + String.Double: "#4e9a06", # class: 's2' + String.Escape: "#4e9a06", # class: 'se' + String.Heredoc: "#4e9a06", # class: 'sh' + String.Interpol: "#4e9a06", # class: 'si' + String.Other: "#4e9a06", # class: 'sx' + String.Regex: "#4e9a06", # class: 'sr' + String.Single: "#4e9a06", # class: 's1' + String.Symbol: "#4e9a06", # class: 'ss' + + Generic: "#000000", # class: 'g' + Generic.Deleted: "#a40000", # class: 'gd' + Generic.Emph: "italic #000000", # class: 'ge' + Generic.Error: "#ef2929", # class: 'gr' + Generic.Heading: "bold #000080", # class: 'gh' + Generic.Inserted: "#00A000", # class: 'gi' + Generic.Output: 
"#888", # class: 'go' + Generic.Prompt: "#745334", # class: 'gp' + Generic.Strong: "bold #000000", # class: 'gs' + Generic.Subheading: "bold #800080", # class: 'gu' + Generic.Traceback: "bold #a40000", # class: 'gt' + } \ No newline at end of file diff --git a/docs/api.rst b/docs/api.rst index 3bd0f3d..f6009cc 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -1,40 +1,34 @@ .. _api: -============= API Reference ============= -camelot.read_pdf -================ -.. automodule:: camelot.read_pdf - :members: +.. module:: camelot -camelot.handlers.PDFHandler -=========================== -.. automodule:: camelot.handlers.PDFHandler - :members: +Main Interface +-------------- +.. autofunction:: camelot.read_pdf +.. autofunction:: camelot.plot_geometry -camelot.parsers.Stream -====================== -.. automodule:: camelot.parsers.Stream - :members: +Lower-Level Classes +------------------- -camelot.parsers.Lattice -======================= -.. automodule:: camelot.parsers.Lattice - :members: +.. autoclass:: camelot.handlers.PDFHandler + :inherited-members: -camelot.core.Cell -================= -.. automodule:: camelot.core.Cell - :members: +.. autoclass:: camelot.parsers.Stream + :inherited-members: -camelot.core.Table -================== -.. automodule:: camelot.core.Table - :members: +.. autoclass:: camelot.parsers.Lattice + :inherited-members: -camelot.core.TableList -====================== -.. automodule:: camelot.core.TableList - :members: \ No newline at end of file +Lower-Lower-Level Classes +------------------------- + +.. autoclass:: camelot.core.TableList + :inherited-members: + +.. autoclass:: camelot.core.Table + :inherited-members: + +.. 
autoclass:: camelot.core.Cell \ No newline at end of file diff --git a/docs/assets/scale_1.png b/docs/assets/scale_1.png deleted file mode 100644 index e9023e0..0000000 Binary files a/docs/assets/scale_1.png and /dev/null differ diff --git a/docs/assets/scale_2.png b/docs/assets/scale_2.png deleted file mode 100644 index 798fd2a..0000000 Binary files a/docs/assets/scale_2.png and /dev/null differ diff --git a/docs/benchmark/lattice/agstat/agstat-data-camelot-page-1-table-1.csv b/docs/benchmark/lattice/agstat/agstat-data-camelot-page-1-table-1.csv new file mode 100755 index 0000000..2e16a02 --- /dev/null +++ b/docs/benchmark/lattice/agstat/agstat-data-camelot-page-1-table-1.csv @@ -0,0 +1,33 @@ +"Sl.No.","District","(In lakhs)for 2012-13Projected Population","Adult (In lakhs)Equivalent to 88%","requirement(In Lakh tonnes)Total Consumption(@ 400gms/adult/day)","(In Lakh tonnes)(Including seeds, feeds & wastage)Total Requirement","Production (Rice)(In Lakh tonnes)","","","Surplus/Defi cit(In Lakh tonnes)","" +"","","","","","","Kharif","Rabi","Total","Rice","Paddy" +"1","Balasore","23.65","20.81","3.04","3.47","2.78","0.86","3.64","0.17","0.25" +"2","Bhadrak","15.34","13.50","1.97","2.25","3.50","0.05","3.55","1.30","1.94" +"3","Balangir","17.01","14.97","2.19","2.50","6.23","0.10","6.33","3.83","5.72" +"4","Subarnapur","6.70","5.90","0.86","0.98","4.48","1.13","5.61","4.63","6.91" +"5","Cuttack","26.63","23.43","3.42","3.91","3.75","0.06","3.81","-0.10","-0.15" +"6","Jagatsingpur","11.49","10.11","1.48","1.69","2.10","0.02","2.12","0.43","0.64" +"7","Jajpur","18.59","16.36","2.39","2.73","2.13","0.04","2.17","-0.56","-0.84" +"8","Kendrapara","14.62","12.87","1.88","2.15","2.60","0.07","2.67","0.52","0.78" +"9","Dhenkanal","12.13","10.67","1.56","1.78","2.26","0.02","2.28","0.50","0.75" +"10","Angul","12.93","11.38","1.66","1.90","1.73","0.02","1.75","-0.15","-0.22" +"11","Ganjam","35.77","31.48","4.60","5.26","4.57","0.00","4.57","-0.69","-1.03" 
+"12","Gajapati","5.85","5.15","0.75","0.86","0.68","0.01","0.69","-0.17","-0.25" +"13","Kalahandi","16.12","14.19","2.07","2.37","5.42","1.13","6.55","4.18","6.24" +"14","Nuapada","6.18","5.44","0.79","0.90","1.98","0.08","2.06","1.16","1.73" +"15","Keonjhar","18.42","16.21","2.37","2.71","2.76","0.08","2.84","0.13","0.19" +"16","Koraput","14.09","12.40","1.81","2.07","2.08","0.34","2.42","0.35","0.52" +"17","Malkangiri","6.31","5.55","0.81","0.93","1.78","0.04","1.82","0.89","1.33" +"18","Nabarangpur","12.50","11.00","1.61","1.84","3.26","0.02","3.28","1.44","2.15" +"19","Rayagada","9.83","8.65","1.26","1.44","1.15","0.03","1.18","-0.26","-0.39" +"20","Mayurbhanj","25.61","22.54","3.29","3.76","4.90","0.06","4.96","1.20","1.79" +"21","Kandhamal","7.45","6.56","0.96","1.10","0.70","0.01","0.71","-0.39","-0.58" +"22","Boudh","4.51","3.97","0.58","0.66","1.73","0.03","1.76","1.10","1.64" +"23","Puri","17.29","15.22","2.22","2.54","2.45","0.99","3.44","0.90","1.34" +"24","Khordha","23.08","20.31","2.97","3.39","2.02","0.03","2.05","-1.34","-2.00" +"25","Nayagarh","9.78","8.61","1.26","1.44","2.10","0.00","2.10","0.66","0.99" +"26","Sambalpur","10.62","9.35","1.37","1.57","3.45","0.71","4.16","2.59","3.87" +"27","Bargarh","15.00","13.20","1.93","2.21","6.87","2.65","9.52","7.31","10.91" +"28","Deogarh","3.18","2.80","0.41","0.47","1.12","0.07","1.19","0.72","1.07" +"29","Jharsuguda","5.91","5.20","0.76","0.87","0.99","0.01","1.00","0.13","0.19" +"30","Sundargarh","21.21","18.66","2.72","3.11","4.72","0.02","4.74","1.63","2.43" +"ODISHA","","427.80","376.49","54.99","62.86","86.29","8.68","94.97","32.11","47.92" diff --git a/docs/benchmark/lattice/agstat/agstat-data-tabula.csv b/docs/benchmark/lattice/agstat/agstat-data-tabula.csv new file mode 100755 index 0000000..3696f06 --- /dev/null +++ b/docs/benchmark/lattice/agstat/agstat-data-tabula.csv @@ -0,0 +1,32 @@ +"Sl. 
No.",District,,,,,"Production (Rice) (In Lakh tonnes)","Surplus/Defi cit (In Lakh tonnes)",,, +"",,,,,,,,,, +1,Balasore,23.65,20.81,3.04,3.47,2.78,0.86,3.64,0.17,0.25 +2,Bhadrak,15.34,13.50,1.97,2.25,3.50,0.05,3.55,1.30,1.94 +3,Balangir,17.01,14.97,2.19,2.50,6.23,0.10,6.33,3.83,5.72 +4,Subarnapur,6.70,5.90,0.86,0.98,4.48,1.13,5.61,4.63,6.91 +5,Cuttack,26.63,23.43,3.42,3.91,3.75,0.06,3.81,-0.10,-0.15 +6,Jagatsingpur,11.49,10.11,1.48,1.69,2.10,0.02,2.12,0.43,0.64 +7,Jajpur,18.59,16.36,2.39,2.73,2.13,0.04,2.17,-0.56,-0.84 +8,Kendrapara,14.62,12.87,1.88,2.15,2.60,0.07,2.67,0.52,0.78 +9,Dhenkanal,12.13,10.67,1.56,1.78,2.26,0.02,2.28,0.50,0.75 +10,Angul,12.93,11.38,1.66,1.90,1.73,0.02,1.75,-0.15,-0.22 +11,Ganjam,35.77,31.48,4.60,5.26,4.57,0.00,4.57,-0.69,-1.03 +12,Gajapati,5.85,5.15,0.75,0.86,0.68,0.01,0.69,-0.17,-0.25 +13,Kalahandi,16.12,14.19,2.07,2.37,5.42,1.13,6.55,4.18,6.24 +14,Nuapada,6.18,5.44,0.79,0.90,1.98,0.08,2.06,1.16,1.73 +15,Keonjhar,18.42,16.21,2.37,2.71,2.76,0.08,2.84,0.13,0.19 +16,Koraput,14.09,12.40,1.81,2.07,2.08,0.34,2.42,0.35,0.52 +17,Malkangiri,6.31,5.55,0.81,0.93,1.78,0.04,1.82,0.89,1.33 +18,Nabarangpur,12.50,11.00,1.61,1.84,3.26,0.02,3.28,1.44,2.15 +19,Rayagada,9.83,8.65,1.26,1.44,1.15,0.03,1.18,-0.26,-0.39 +20,Mayurbhanj,25.61,22.54,3.29,3.76,4.90,0.06,4.96,1.20,1.79 +21,Kandhamal,7.45,6.56,0.96,1.10,0.70,0.01,0.71,-0.39,-0.58 +22,Boudh,4.51,3.97,0.58,0.66,1.73,0.03,1.76,1.10,1.64 +23,Puri,17.29,15.22,2.22,2.54,2.45,0.99,3.44,0.90,1.34 +24,Khordha,23.08,20.31,2.97,3.39,2.02,0.03,2.05,-1.34,-2.00 +25,Nayagarh,9.78,8.61,1.26,1.44,2.10,0.00,2.10,0.66,0.99 +26,Sambalpur,10.62,9.35,1.37,1.57,3.45,0.71,4.16,2.59,3.87 +27,Bargarh,15.00,13.20,1.93,2.21,6.87,2.65,9.52,7.31,10.91 +28,Deogarh,3.18,2.80,0.41,0.47,1.12,0.07,1.19,0.72,1.07 +29,Jharsuguda,5.91,5.20,0.76,0.87,0.99,0.01,1.00,0.13,0.19 +30,Sundargarh,21.21,18.66,2.72,3.11,4.72,0.02,4.74,1.63,2.43 diff --git a/docs/benchmark/lattice/agstat/agstat-table-detection-camelot.png 
b/docs/benchmark/lattice/agstat/agstat-table-detection-camelot.png new file mode 100755 index 0000000..9752a1b Binary files /dev/null and b/docs/benchmark/lattice/agstat/agstat-table-detection-camelot.png differ diff --git a/docs/benchmark/lattice/agstat/agstat-table-detection-tabula.png b/docs/benchmark/lattice/agstat/agstat-table-detection-tabula.png new file mode 100755 index 0000000..815e81b Binary files /dev/null and b/docs/benchmark/lattice/agstat/agstat-table-detection-tabula.png differ diff --git a/docs/benchmark/lattice/agstat/agstat.pdf b/docs/benchmark/lattice/agstat/agstat.pdf new file mode 100755 index 0000000..cf1c25a Binary files /dev/null and b/docs/benchmark/lattice/agstat/agstat.pdf differ diff --git a/docs/benchmark/lattice/background_lines_1/background_lines_1-data-camelot-page-1-table-1.csv b/docs/benchmark/lattice/background_lines_1/background_lines_1-data-camelot-page-1-table-1.csv new file mode 100755 index 0000000..a02266c --- /dev/null +++ b/docs/benchmark/lattice/background_lines_1/background_lines_1-data-camelot-page-1-table-1.csv @@ -0,0 +1,3 @@ +"The Red Ribbon Express (RRE) is the world’s largest mass mobilisation drive on HIV/AIDS. The train will travel through 22 states, during its one year long journey, halting at 152 stations. Through the RRE, NACO, intends to break the silence surrounding the issue of HIV/AIDS, by taking the messages on prevention, care and support to people living in small towns and villages across the country. The aim is also to create an environment, free from stigma and discrimination faced by people living with HIV, so they can access the services, without fear and prejudice, and live a life of dignity. 
It has proved to be a successful multi- sectoral initiative, of the NACO and a powerful advocacy tool, both at the state and district level, besides enhancing local capacity to deal with HIV prevention.","","" +"","","" +"","","" diff --git a/docs/benchmark/lattice/background_lines_1/background_lines_1-data-camelot-page-1-table-2.csv b/docs/benchmark/lattice/background_lines_1/background_lines_1-data-camelot-page-1-table-2.csv new file mode 100755 index 0000000..274bd62 --- /dev/null +++ b/docs/benchmark/lattice/background_lines_1/background_lines_1-data-camelot-page-1-table-2.csv @@ -0,0 +1,8 @@ +"State","Date","Halt stations","Halt days","Persons directly reached(in lakh)","Persons trained","Persons counseled","Persons testedfor HIV" +"Delhi","1.12.2009","8","17","1.29","3,665","2,409","1,000" +"Rajasthan","2.12.2009 to 19.12.2009","","","","","","" +"Gujarat","20.12.2009 to 3.1.2010","6","13","6.03","3,810","2,317","1,453" +"Maharashtra","4.01.2010 to 1.2.2010","13","26","1.27","5,680","9,027","4,153" +"Karnataka","2.2.2010 to 22.2.2010","11","19","1.80","5,741","3,658","3,183" +"Kerala","23.2.2010 to 11.3.2010","9","17","1.42","3,559","2,173","855" +"Total","","47","92","11.81","22,455","19,584","10,644" diff --git a/docs/benchmark/lattice/background_lines_1/background_lines_1-data-tabula.csv b/docs/benchmark/lattice/background_lines_1/background_lines_1-data-tabula.csv new file mode 100755 index 0000000..5dfd559 --- /dev/null +++ b/docs/benchmark/lattice/background_lines_1/background_lines_1-data-tabula.csv @@ -0,0 +1,8 @@ +State,Date,"Halt stations","Halt days","Persons directly reached (in lakh)","Persons trained","Persons counseled","Persons tested for HIV" +Delhi,1.12.2009,8,17,1.29,"3,665","2,409","1,000" +Rajasthan,"2.12.2009 to 19.12.2009",,,,,, +Gujarat,"20.12.2009 to 3.1.2010",6,13,6.03,"3,810","2,317","1,453" +Maharashtra,"4.01.2010 to 1.2.2010",13,26,1.27,"5,680","9,027","4,153" +Karnataka,"2.2.2010 to 22.2.2010",11,19,1.80,"5,741","3,658","3,183" 
+Kerala,"23.2.2010 to 11.3.2010",9,17,1.42,"3,559","2,173",855 +Total,,47,92,11.81,"22,455","19,584","10,644" diff --git a/docs/benchmark/lattice/background_lines_1/background_lines_1-table-detection-camelot.png b/docs/benchmark/lattice/background_lines_1/background_lines_1-table-detection-camelot.png new file mode 100755 index 0000000..676bd76 Binary files /dev/null and b/docs/benchmark/lattice/background_lines_1/background_lines_1-table-detection-camelot.png differ diff --git a/docs/benchmark/lattice/background_lines_1/background_lines_1-table-detection-tabula.png b/docs/benchmark/lattice/background_lines_1/background_lines_1-table-detection-tabula.png new file mode 100755 index 0000000..01adffc Binary files /dev/null and b/docs/benchmark/lattice/background_lines_1/background_lines_1-table-detection-tabula.png differ diff --git a/docs/benchmark/lattice/background_lines_1/background_lines_1.pdf b/docs/benchmark/lattice/background_lines_1/background_lines_1.pdf new file mode 100755 index 0000000..f23d6b7 Binary files /dev/null and b/docs/benchmark/lattice/background_lines_1/background_lines_1.pdf differ diff --git a/docs/benchmark/lattice/background_lines_2/background_lines_2-data-camelot-page-1-table-1.csv b/docs/benchmark/lattice/background_lines_2/background_lines_2-data-camelot-page-1-table-1.csv new file mode 100755 index 0000000..fa9c68c --- /dev/null +++ b/docs/benchmark/lattice/background_lines_2/background_lines_2-data-camelot-page-1-table-1.csv @@ -0,0 +1,27 @@ +"Sl.No","SIZE GROUP(HA)","NO. OF HOLDINGS GROWING THE CROP","","AREA UNDER THE CROP","","","AREA UNDER THE CROP TREATED WITHTHE MANURE","","" +"","","TOTAL NO.","NO. 
TREATED WITHTHE MANURE","HYV","OTHERS","TOTAL","HYV","OTHERS","TOTAL" +"(1)","(2)","(3)","(4)","(5)","(6)","(7)","(8)","(9)","(10)" +"1","MARGINAL (BELOW 1.0)","","","","","","","","" +"","I","39053","0","12142","3322","15464","0","0","0" +"","UI","7429","0","2088","1560","3648","0","0","0" +"","T","46484","0","14230","4882","19112","0","0","0" +"2","SMALL (1.0 - 1.99)","","","","","","","","" +"","I","20341","0","16685","1631","18316","0","0","0" +"","UI","6854","0","4594","1885","6479","0","0","0" +"","T","27197","0","21279","3516","24795","0","0","0" +"3","SEMI-MEDIUM (2.0 - 3.99)","","","","","","","","" +"","I","20800","0","16991","7643","24634","0","0","0" +"","UI","5856","0","1017","4819","5836","0","0","0" +"","T","26555","0","18008","12462","30470","0","0","0" +"4","MEDIUM (4.0 - 9.99)","","","","","","","","" +"","I","11986","0","17576","4120","21696","0","0","0" +"","UI","4615","0","1446","6227","7673","0","0","0" +"","T","16312","0","19022","10347","29369","0","0","0" +"5","LARGE (10 AND ABOVE)","","","","","","","","" +"","I","2005","0","3671","639","4310","0","0","0" +"","UI","521","0","611","831","1442","0","0","0" +"","T","2485","0","4282","1470","5752","0","0","0" +"6","ALL GROUPS","","","","","","","","" +"","I","94185","0","67065","17355","84420","0","0","0" +"","UI","25275","0","9756","15322","25078","0","0","0" +"","T","119033","0","76821","32677","109498","0","0","0" diff --git a/docs/benchmark/lattice/background_lines_2/background_lines_2-data-tabula.csv b/docs/benchmark/lattice/background_lines_2/background_lines_2-data-tabula.csv new file mode 100755 index 0000000..47ab20e --- /dev/null +++ b/docs/benchmark/lattice/background_lines_2/background_lines_2-data-tabula.csv @@ -0,0 +1,29 @@ +Sl.No,"SIZE GROUP (HA)","NO. OF HOLDINGS GROWING THE CROP TOTAL NO.NO. TREATED WITH THE MANURE",AREA UNDER THE CROP,"AREA UNDER THE CROP TREATED WITH THE MANURE",,,,, +"",,"NO. 
TREATED WITH THE MANURE",HYV,OTHERS,TOTAL,HYV,OTHERS,TOTAL, +"(1)",(2),(3),(4),(5),(6),(7),(8),(9),(10) +1,ARGINAL (BELOW 1.0),,,,,,,, +"","I UI T",39053,0,12142,332,15464,0,0,0 +"",7429,0,2088,1560,3648,0,0,0, +"",46484,0,14230,488,19112,0,0,0, +2,MALL (1.0 - 1.99),,,,,,,, +"","I UI T",20341,0,16685,163,18316,0,0,0 +"",6854,0,4594,1885,6479,0,0,0, +"",27197,0,21279,351,24795,0,0,0, +3,EMI-MEDIUM (2.0 - 3.99),,,,,,,, +"","I UI T",20800,0,16991,764,24634,0,0,0 +"",5856,0,1017,4819,5836,0,0,0, +"",26555,0,18008,1246,30470,0,0,0, +4,EDIUM (4.0 - 9.99),,,,,,,, +"","I UI T",11986,0,17576,412,21696,0,0,0 +"",4615,0,1446,6227,7673,0,0,0, +"",16312,0,19022,1034,29369,0,0,0, +5,ARGE (10 AND ABOVE),,,,,,,, +"","I UI T",2005,0,3671,63,4310,0,0,0 +"",521,0,611,831,1442,0,0,0, +"",2485,0,4282,147,5752,0,0,0, +"",LL GROUPS,,,,,,,, +"",94185,0,67065,1735,84420,0,0,0, +6,,,,,,,,, +"NO. OF HOLDINGS GROWING THE CROP TOTAL NO.NO. TREATED WITH THE MANURE" +"NO. TREATED WITH THE MANURE" +6 diff --git a/docs/benchmark/lattice/background_lines_2/background_lines_2-table-detection-camelot.png b/docs/benchmark/lattice/background_lines_2/background_lines_2-table-detection-camelot.png new file mode 100755 index 0000000..67b905f Binary files /dev/null and b/docs/benchmark/lattice/background_lines_2/background_lines_2-table-detection-camelot.png differ diff --git a/docs/benchmark/lattice/background_lines_2/background_lines_2-table-detection-tabula.png b/docs/benchmark/lattice/background_lines_2/background_lines_2-table-detection-tabula.png new file mode 100755 index 0000000..51163ee Binary files /dev/null and b/docs/benchmark/lattice/background_lines_2/background_lines_2-table-detection-tabula.png differ diff --git a/docs/benchmark/lattice/background_lines_2/background_lines_2.pdf b/docs/benchmark/lattice/background_lines_2/background_lines_2.pdf new file mode 100755 index 0000000..b64b2f2 Binary files /dev/null and b/docs/benchmark/lattice/background_lines_2/background_lines_2.pdf differ diff 
--git a/docs/benchmark/lattice/column_span_1/column_span_1-data-camelot-page-1-table-1.csv b/docs/benchmark/lattice/column_span_1/column_span_1-data-camelot-page-1-table-1.csv new file mode 100755 index 0000000..3e4668e --- /dev/null +++ b/docs/benchmark/lattice/column_span_1/column_span_1-data-camelot-page-1-table-1.csv @@ -0,0 +1,50 @@ +"Sl. No.","Year","Population (in Lakh)","Accidental Deaths","","Suicides","","Percentage Population growth" +"","","","Incidence","Rate","Incidence","Rate","" +"(1)","(2)","(3)","(4)","(5)","(6)","(7)","(8)" +"1.","1967","4999","126762","25.4","38829","7.8","2.2" +"2.","1968","5111","126232","24.7","40688","8.0","2.2" +"3.","1969","5225","130755","25.0","43633","8.4","2.2" +"4.","1970","5343","139752","26.2","48428","9.1","2.3" +"5.","1971","5512","105601","19.2","43675","7.9","3.2" +"6.","1972","5635","106184","18.8","43601","7.7","2.2" +"7.","1973","5759","130654","22.7","40807","7.1","2.2" +"8.","1974","5883","110624","18.8","46008","7.8","2.2" +"9.","1975","6008","113016","18.8","42890","7.1","2.1" +"10.","1976","6136","111611","18.2","41415","6.7","2.1" +"11.","1977","6258","117338","18.8","39718","6.3","2.0" +"12.","1978","6384","118594","18.6","40207","6.3","2.0" +"13.","1979","6510","108987","16.7","38217","5.9","2.0" +"14.","1980","6636","116912","17.6","41663","6.3","1.9" +"15.","1981","6840","122221","17.9","40245","5.9","3.1" +"16.","1982","7052","125993","17.9","44732","6.3","3.1" +"17.","1983","7204","128576","17.8","46579","6.5","2.2" +"18.","1984","7356","134628","18.3","50571","6.9","2.1" +"19.","1985","7509","139657","18.6","52811","7.0","2.1" +"20.","1986","7661","147023","19.2","54357","7.1","2.0" +"21.","1987","7814","152314","19.5","58568","7.5","2.0" +"22.","1988","7966","163522","20.5","64270","8.1","1.9" +"23.","1989","8118","169066","20.8","68744","8.5","1.9" +"24.","1990","8270","174401","21.1","73911","8.9","1.9" +"25.","1991","8496","188003","22.1","78450","9.2","2.7" 
+"26.","1992","8677","194910","22.5","80149","9.2","2.1" +"27.","1993","8838","192357","21.8","84244","9.5","1.9" +"28.","1994","8997","190435","21.2","89195","9.9","1.8" +"29.","1995","9160","222487","24.3","89178","9.7","1.8" +"30.","1996","9319","220094","23.6","88241","9.5","1.7" +"31.","1997","9552","233903","24.5","95829","10.0","2.5" +"32.","1998","9709","258409","26.6","104713","10.8","1.6" +"33.","1999","9866","271918","27.6","110587","11.2","1.6" +"34.","2000","10021","255883","25.5","108593","10.8","1.6" +"35.","2001","10270","271019","26.4","108506","10.6","2.5" +"36.","2002","10506","260122","24.8","110417","10.5","2.3" +"37.","2003","10682","259625","24.3","110851","10.4","1.7" +"38.","2004","10856","277263","25.5","113697","10.5","1.6" +"39.","2005","11028","294175","26.7","113914","10.3","1.6" +"40.","2006","11198","314704","28.1","118112","10.5","1.5" +"41.","2007","11366","340794","30.0","122637","10.8","1.5" +"42.","2008","11531","342309","29.7","125017","10.8","1.4" +"43.","2009","11694","357021","30.5","127151","10.9","1.4" +"44.","2010","11858","384649","32.4","134599","11.4","1.4" +"45.","2011","12102","390884","32.3","135585","11.2","2.1" +"46.","2012","12134","394982","32.6","135445","11.2","1.0" +"47.","2013","12288","400517","32.6","134799","11.0","1.0" diff --git a/docs/benchmark/lattice/column_span_1/column_span_1-data-tabula.csv b/docs/benchmark/lattice/column_span_1/column_span_1-data-tabula.csv new file mode 100755 index 0000000..b2f9ce7 --- /dev/null +++ b/docs/benchmark/lattice/column_span_1/column_span_1-data-tabula.csv @@ -0,0 +1,50 @@ +Rate of Accidental Deaths & Suicides and Population Growth During 1967 to 2013,,,,,,, +"Sl. 
No.",Year,"Population (in Lakh)",Accidental Deaths,Suicides,"Percentage Population growth",, +"",,,Incidence,Rate,Incidence,Rate, +"(1)",(2),(3),(4),(5),(6),(7),(8) +1.,1967,4999,126762,25.4,38829,7.8,2.2 +2.,1968,5111,126232,24.7,40688,8.0,2.2 +3.,1969,5225,130755,25.0,43633,8.4,2.2 +4.,1970,5343,139752,26.2,48428,9.1,2.3 +5.,1971,5512,105601,19.2,43675,7.9,3.2 +6.,1972,5635,106184,18.8,43601,7.7,2.2 +7.,1973,5759,130654,22.7,40807,7.1,2.2 +8.,1974,5883,110624,18.8,46008,7.8,2.2 +9.,1975,6008,113016,18.8,42890,7.1,2.1 +10.,1976,6136,111611,18.2,41415,6.7,2.1 +11.,1977,6258,117338,18.8,39718,6.3,2.0 +12.,1978,6384,118594,18.6,40207,6.3,2.0 +13.,1979,6510,108987,16.7,38217,5.9,2.0 +14.,1980,6636,116912,17.6,41663,6.3,1.9 +15.,1981,6840,122221,17.9,40245,5.9,3.1 +16.,1982,7052,125993,17.9,44732,6.3,3.1 +17.,1983,7204,128576,17.8,46579,6.5,2.2 +18.,1984,7356,134628,18.3,50571,6.9,2.1 +19.,1985,7509,139657,18.6,52811,7.0,2.1 +20.,1986,7661,147023,19.2,54357,7.1,2.0 +21.,1987,7814,152314,19.5,58568,7.5,2.0 +22.,1988,7966,163522,20.5,64270,8.1,1.9 +23.,1989,8118,169066,20.8,68744,8.5,1.9 +24.,1990,8270,174401,21.1,73911,8.9,1.9 +25.,1991,8496,188003,22.1,78450,9.2,2.7 +26.,1992,8677,194910,22.5,80149,9.2,2.1 +27.,1993,8838,192357,21.8,84244,9.5,1.9 +28.,1994,8997,190435,21.2,89195,9.9,1.8 +29.,1995,9160,222487,24.3,89178,9.7,1.8 +30.,1996,9319,220094,23.6,88241,9.5,1.7 +31.,1997,9552,233903,24.5,95829,10.0,2.5 +32.,1998,9709,258409,26.6,104713,10.8,1.6 +33.,1999,9866,271918,27.6,110587,11.2,1.6 +34.,2000,10021,255883,25.5,108593,10.8,1.6 +35.,2001,10270,271019,26.4,108506,10.6,2.5 +36.,2002,10506,260122,24.8,110417,10.5,2.3 +37.,2003,10682,259625,24.3,110851,10.4,1.7 +38.,2004,10856,277263,25.5,113697,10.5,1.6 +39.,2005,11028,294175,26.7,113914,10.3,1.6 +40.,2006,11198,314704,28.1,118112,10.5,1.5 +41.,2007,11366,340794,30.0,122637,10.8,1.5 +42.,2008,11531,342309,29.7,125017,10.8,1.4 +43.,2009,11694,357021,30.5,127151,10.9,1.4 +44.,2010,11858,384649,32.4,134599,11.4,1.4 
+45.,2011,12102,390884,32.3,135585,11.2,2.1 +46.,2012,12134,394982,32.6,135445,11.2,1.0 diff --git a/docs/benchmark/lattice/column_span_1/column_span_1-table-detection-camelot.png b/docs/benchmark/lattice/column_span_1/column_span_1-table-detection-camelot.png new file mode 100755 index 0000000..c702a72 Binary files /dev/null and b/docs/benchmark/lattice/column_span_1/column_span_1-table-detection-camelot.png differ diff --git a/docs/benchmark/lattice/column_span_1/column_span_1-table-detection-tabula.png b/docs/benchmark/lattice/column_span_1/column_span_1-table-detection-tabula.png new file mode 100755 index 0000000..f297f1f Binary files /dev/null and b/docs/benchmark/lattice/column_span_1/column_span_1-table-detection-tabula.png differ diff --git a/docs/benchmark/lattice/column_span_1/column_span_1.pdf b/docs/benchmark/lattice/column_span_1/column_span_1.pdf new file mode 100755 index 0000000..e7c164e Binary files /dev/null and b/docs/benchmark/lattice/column_span_1/column_span_1.pdf differ diff --git a/docs/benchmark/lattice/column_span_2/column_span_2-data-camelot-page-1-table-1.csv b/docs/benchmark/lattice/column_span_2/column_span_2-data-camelot-page-1-table-1.csv new file mode 100755 index 0000000..3070f88 --- /dev/null +++ b/docs/benchmark/lattice/column_span_2/column_span_2-data-camelot-page-1-table-1.csv @@ -0,0 +1,11 @@ +"Investigations","No. 
ofHHs","Age/Sex/Physiological Group","Preva-lence","C.I*","RelativePrecision","Sample sizeper State" +"Anthropometry","2400","All the available individuals","","","","" +"Clinical Examination","","","","","","" +"History of morbidity","","","","","","" +"Diet survey","1200","All the individuals partaking meals in the HH","","","","" +"Blood Pressure #","2400","Men (≥ 18yrs)","10%","95%","20%","1728" +"","","Women (≥ 18 yrs)","","","","1728" +"Fasting blood glucose","2400","Men (≥ 18 yrs)","5%","95%","20%","1825" +"","","Women (≥ 18 yrs)","","","","1825" +"Knowledge &Practices on HTN &DM","2400","Men (≥ 18 yrs)","-","-","-","1728" +"","2400","Women (≥ 18 yrs)","-","-","-","1728" diff --git a/docs/benchmark/lattice/column_span_2/column_span_2-data-tabula.csv b/docs/benchmark/lattice/column_span_2/column_span_2-data-tabula.csv new file mode 100755 index 0000000..9f4c6e1 --- /dev/null +++ b/docs/benchmark/lattice/column_span_2/column_span_2-data-tabula.csv @@ -0,0 +1,10 @@ +Investigations,"No. 
of HHs","Age/Sex/ Physiological Group","Preva- lence",C.I*,"Relative Precision","Sample size per State" +Anthropometry,2400,All the available individuals,,,, +Clinical Examination,,,,,, +History of morbidity,,,,,, +Diet survey,1200,All the individuals partaking meals in the HH,,,, +Blood Pressure #,2400,Men (≥ 18yrs),10%,95%,20%,1728 +"",,Women (≥ 18 yrs),1728,,, +Fasting blood glucose,2400,Men (≥ 18 yrs),5%,95%,20%,1825 +"",Women (≥ 18 yrs),1825,,,, +2400,Men (≥ 18 yrs),-,-,-,1728, diff --git a/docs/benchmark/lattice/column_span_2/column_span_2-table-detection-camelot.png b/docs/benchmark/lattice/column_span_2/column_span_2-table-detection-camelot.png new file mode 100755 index 0000000..1504bbf Binary files /dev/null and b/docs/benchmark/lattice/column_span_2/column_span_2-table-detection-camelot.png differ diff --git a/docs/benchmark/lattice/column_span_2/column_span_2-table-detection-tabula.png b/docs/benchmark/lattice/column_span_2/column_span_2-table-detection-tabula.png new file mode 100755 index 0000000..a375cc7 Binary files /dev/null and b/docs/benchmark/lattice/column_span_2/column_span_2-table-detection-tabula.png differ diff --git a/docs/benchmark/lattice/column_span_2/column_span_2.pdf b/docs/benchmark/lattice/column_span_2/column_span_2.pdf new file mode 100755 index 0000000..5cab903 Binary files /dev/null and b/docs/benchmark/lattice/column_span_2/column_span_2.pdf differ diff --git a/docs/benchmark/lattice/electoral_roll/electoral_roll-data-camelot-page-1-table-1.csv b/docs/benchmark/lattice/electoral_roll/electoral_roll-data-camelot-page-1-table-1.csv new file mode 100755 index 0000000..a4faff0 --- /dev/null +++ b/docs/benchmark/lattice/electoral_roll/electoral_roll-data-camelot-page-1-table-1.csv @@ -0,0 +1,32 @@ +"Section No & Name -1-DG-3, DDA FLATS VIKAS PURI ,DELHI","","","","","","","","","","" +"","1 NEL2976271","","","2 NEL3800892","","","3 NEL3767760","","","" +"","Mukesh KumarName :Father's :Lok Nath JhambName0House No :Age : 55Sex : 
Male","","","Dharamveer NandaName :Father's :Madan LalName15House No :Age : 63Sex : Male","","","Amit NandaName :Father's :Dharamvir NandaName15House No :Age : 33Sex : Male","","","" +"","","","","","","","","","","" +"","4 NEL3559068","","","5 NEL3905551","","","6 NEL3827317","","","" +"","PoojaName :Father's :P D JoshiName31House No :Age : 23Sex : Female","","","Aashray DuttaName :Father's :Ajay DuttaName48House No :Age : 22Sex : Male","","","Pushpjeet Kaur SinghName :Husband's :Baljit SinghName55House No :Age : 62Sex : Female","","","" +"","","","","","","","","","","" +"","7 NEL3475357","","","8 NEL3896791","","","9 NEL3896627","","","" +"","LakshmiName :Husband's :Lt KrishnanName62House No :Age : 80Sex : Female","","","Davinder KumarName :Father's :Ram RattanName63House No :Age : 61Sex : Male","","","DeepaName :Husband's :Davinder KumarName63House No :Age : 54Sex : Female","","","" +"","","","","","","","","","","" +"","10 NEL3784815","","","11 NEL3244199","","","12 LQK1856012","","","" +"","Tavishi DuttName :Father's :Anil DuttName73House No :Age : 20Sex : Female","","","Devinder VermaName :Father's :Prem Singh VermaName75House No :Age : 49Sex : Male","","","Sheetal BansalName :Father's :Vijay SharmaName88House No :Age : 41Sex : Female","","","" +"","","","","","","","","","","" +"","13 NEL3842365","","","14 NEL2973293","","","15 NEL2950060","","","" +"","Joyti AdhkariName :Husband's :Rakesh AdhikariName92House No :Age : 28Sex : Female","","","RituName :Husband's :Rajesh KumarName104House No :Age : 45Sex : Female","","","Rajesh KumarName :Father's :Rai Bahadur JunejaName104House No :Age : 43Sex : Male","","","" +"","","","","","","","","","","" +"","16 NEL2971959","","","17 NEL3843173","","","18 NEL4068250","","","" +"","ParulName :Father's :Rajesh KumarName104House No :Age : 23Sex : Female","","","Sushila GudvenaName :Father's :A Chayya MysaName119-BHouse No :Age : 46Sex : Female","","","DrishtiName :Father's :PramodhName126House No :Age : 22Sex : 
Female","","","" +"","","","","","","","","","","" +"","19 NEL3817466","","","20 NEL3834049","","","21 NEL3247474","","","" +"","AnkitName :Father's :Dinesh KumarName133House No :Age : 29Sex : Male","","","Vibhu NandaName :Father's :Pradeep Kumar NandaName143House No :Age : 25Sex : Male","","","Mool ChandName :Father's :Ghisa RamName145House No :Age : 47Sex : Male","","","" +"","","","","","","","","","","" +"","22 NEL3710472","","","23 NEL3722823","","","24 NEL3801163","","","" +"","Meha Elizabeth VargheseName :Father's :Varughese MathewName147House No :Age : 20Sex : Female","","","Mohit ChadhaName :Mother's :Promila ChadhaName151House No :Age : 24Sex : Male","","","Sakshi RanaName :Father's :Surinder RanaName155House No :Age : 21Sex : Female","","","" +"","","","","","","","","","","" +"","25 NEL3654752","","","26 NEL3714978","","","27 NEL2926482","","","" +"","Devender KumarName :Father's :Om Prakash RustagiName178House No :Age : 41Sex : Male","","","Shuchi RustagiName :Husband's :Devender KumarName178House No :Age : 35Sex : Female","","","Narayanan T PName :Father's :Sekharan NairName194House No :Age : 59Sex : Male","","","" +"","","","","","","","","","","" +"","28 NEL4131230","","","29 NEL3379277","","","30 NEL3379433","","","" +"","Papiya BoseName :Husband's :Vipin KumarName195House No :Age : 33Sex : Female","","","Depali ChaudharyName :Husband's :Nitin KumarName195House No :Age : 28Sex : Female","","","Arvind VermaName :Father's :Pr VermaName201House No :Age : 45Sex : Male","","","" +"","","","","","","","","","","" +"","","","","","","","","","","" diff --git a/docs/benchmark/lattice/electoral_roll/electoral_roll-data-tabula.csv b/docs/benchmark/lattice/electoral_roll/electoral_roll-data-tabula.csv new file mode 100755 index 0000000..7d23586 --- /dev/null +++ b/docs/benchmark/lattice/electoral_roll/electoral_roll-data-tabula.csv @@ -0,0 +1,56 @@ +2 +3 +"" +"" +"" +4 +5 +6 +"" +"" +"" +7 +8 +9 +"" +"" +"" +10 +11 +12 +"" +"" +"" +13 +14 +15 +"" +"" +"" +16 
+17 +18 +"" +"" +"" +19 +20 +21 +"" +"" +"" +22 +23 +24 +"" +"" +"" +25 +26 +27 +"" +"" +"" +28 +29 +30 diff --git a/docs/benchmark/lattice/electoral_roll/electoral_roll-table-detection-camelot.png b/docs/benchmark/lattice/electoral_roll/electoral_roll-table-detection-camelot.png new file mode 100755 index 0000000..41f1017 Binary files /dev/null and b/docs/benchmark/lattice/electoral_roll/electoral_roll-table-detection-camelot.png differ diff --git a/docs/benchmark/lattice/electoral_roll/electoral_roll-table-detection-tabula.png b/docs/benchmark/lattice/electoral_roll/electoral_roll-table-detection-tabula.png new file mode 100755 index 0000000..e185303 Binary files /dev/null and b/docs/benchmark/lattice/electoral_roll/electoral_roll-table-detection-tabula.png differ diff --git a/docs/benchmark/lattice/electoral_roll/electoral_roll.pdf b/docs/benchmark/lattice/electoral_roll/electoral_roll.pdf new file mode 100755 index 0000000..e7547ef Binary files /dev/null and b/docs/benchmark/lattice/electoral_roll/electoral_roll.pdf differ diff --git a/docs/benchmark/lattice/rotated/rotated-data-camelot-page-1-table-1.csv b/docs/benchmark/lattice/rotated/rotated-data-camelot-page-1-table-1.csv new file mode 100755 index 0000000..fc99bdb --- /dev/null +++ b/docs/benchmark/lattice/rotated/rotated-data-camelot-page-1-table-1.csv @@ -0,0 +1,13 @@ +"State","Nutritional Assessment (No. of individuals)","","","","IYCF Practices (No. of mothers: 2011-12)","Blood Pressure (No. of adults: 2011-12)","","Fasting Blood Sugar (No. 
of adults:2011-12)","" +"","1975-79","1988-90","1996-97","2011-12","","Men","Women","Men","Women" +"Kerala","5738","6633","8864","8297","245","2161","3195","1645","2391" +"Tamil Nadu","7387","10217","5813","7851","413","2134","2858","1119","1739" +"Karnataka","6453","8138","12606","8958","428","2467","2894","1628","2028" +"Andhra Pradesh","5844","9920","9545","8300","557","1899","2493","1111","1529" +"Maharashtra","5161","7796","6883","9525","467","2368","2648","1417","1599" +"Gujarat","4403","5374","4866","9645","477","2687","3021","2122","2503" +"Madhya Pradesh","*","*","*","7942","470","1965","2150","1579","1709" +"Orissa","3756","5540","12024","8473","398","2040","2624","1093","1628" +"West Bengal","*","*","*","8047","423","2058","2743","1413","2027" +"Uttar Pradesh","*","*","*","9860","581","2139","2415","1185","1366" +"Pooled","38742","53618","60601","86898","4459","21918","27041","14312","18519" diff --git a/docs/benchmark/lattice/rotated/rotated-data-tabula.csv b/docs/benchmark/lattice/rotated/rotated-data-tabula.csv new file mode 100755 index 0000000..3809282 --- /dev/null +++ b/docs/benchmark/lattice/rotated/rotated-data-tabula.csv @@ -0,0 +1,9 @@ +"State la il Nadu",,,T,ble,"1 : Nut (","TA itio o.","E al f in","IS ss ivi","CO ss ual","ER ent )",G,PA +"",,,"197 57 73","-79 8 7","1 1","88-9 633 21",,"996 88 58","97 4 3",,"011 829 785",12 +"ataka hra Prad arashtra rat hya Pra",sh,,"64 58","3 4",,"138 920",,"126 95","6 5",,"895 830", +"",es,,"44 *",3,,"374 *",,"48 *",6,,"964 794", +"sa t Bengal r Pradesh led * Data not",vail,"37 * * 38 ble","6 42",5,"540 * * 61",,"120 * * 606","4 1",,"847 804 986 68",8, +MB,,,,,,,,,,,, +"",,,,,,,,,,,, +"",,,,,,,,,,,, +"",,,,,,,,,,,, diff --git a/docs/benchmark/lattice/rotated/rotated-table-detection-camelot.png b/docs/benchmark/lattice/rotated/rotated-table-detection-camelot.png new file mode 100755 index 0000000..044721c Binary files /dev/null and b/docs/benchmark/lattice/rotated/rotated-table-detection-camelot.png differ 
diff --git a/docs/benchmark/lattice/rotated/rotated-table-detection-tabula.png b/docs/benchmark/lattice/rotated/rotated-table-detection-tabula.png new file mode 100755 index 0000000..ad9f042 Binary files /dev/null and b/docs/benchmark/lattice/rotated/rotated-table-detection-tabula.png differ diff --git a/docs/benchmark/lattice/rotated/rotated.pdf b/docs/benchmark/lattice/rotated/rotated.pdf new file mode 100755 index 0000000..8b7a615 Binary files /dev/null and b/docs/benchmark/lattice/rotated/rotated.pdf differ diff --git a/docs/benchmark/lattice/row_span/row_span-data-camelot-page-1-table-1.csv b/docs/benchmark/lattice/row_span/row_span-data-camelot-page-1-table-1.csv new file mode 100755 index 0000000..1d2df13 --- /dev/null +++ b/docs/benchmark/lattice/row_span/row_span-data-camelot-page-1-table-1.csv @@ -0,0 +1,40 @@ +"Plan Type","County","Plan Name","Totals" +"GMC","Sacramento","Anthem Blue Cross","164,380" +"","","Health Net","126,547" +"","","Kaiser Foundation","74,620" +"","","Molina Healthcare","59,989" +"","San Diego","Care 1st Health Plan","71,831" +"","","Community Health Group","264,639" +"","","Health Net","72,404" +"","","Kaiser","50,415" +"","","Molina Healthcare","206,430" +"","Total GMC Enrollment","","1,091,255" +"COHS","Marin","Partnership Health Plan of CA","36,006" +"","Mendocino","","37,243" +"","Napa","","28,398" +"","Solano","","113,220" +"","Sonoma","","112,271" +"","Yolo","","52,674" +"","Del Norte","","11,242" +"","Humboldt","","49,911" +"","Lake","","29,149" +"","Lassen","","7,360" +"","Modoc","","2,940" +"","Shasta","","61,763" +"","Siskiyou","","16,715" +"","Trinity","","4,542" +"","Merced","Central California Alliance for Health","123,907" +"","Monterey","","147,397" +"","Santa Cruz","","69,458" +"","Santa Barbara","CenCal","117,609" +"","San Luis Obispo","","55,761" +"","Orange","CalOptima","783,079" +"","San Mateo","Health Plan of San Mateo","113,202" +"","Ventura","Gold Coast Health Plan","202,217" +"","Total COHS 
Enrollment","","2,176,064" +"Subtotal for Two-Plan, Regional Model, GMC and COHS","","","10,132,022" +"PCCM","Los Angeles","AIDS Healthcare Foundation","828" +"","San Francisco","Family Mosaic","25" +"","Total PHP Enrollment","","853" +"All Models Total Enrollments","","","10,132,875" +"Source: Data Warehouse 12/14/15","","","" diff --git a/docs/benchmark/lattice/row_span/row_span-data-tabula.csv b/docs/benchmark/lattice/row_span/row_span-data-tabula.csv new file mode 100755 index 0000000..b60bbcc --- /dev/null +++ b/docs/benchmark/lattice/row_span/row_span-data-tabula.csv @@ -0,0 +1,39 @@ +Plan Type,County,Plan Name,Totals +GMC,Sacramento,Anthem Blue Cross,"164,380" +"",,Health Net,"126,547" +"",,Kaiser Foundation,"74,620" +"",,Molina Healthcare,"59,989" +"",San Diego,Care 1st Health Plan,"71,831" +"",,Community Health Group,"264,639" +"",,Health Net,"72,404" +"",,Kaiser,"50,415" +"",,Molina Healthcare,"206,430" +"",Total GMC Enrollment,"1,091,255", +COHS,Marin,Partnership Health Plan of CA,"36,006" +"",Mendocino,"37,243", +"",Napa,"28,398", +"",Solano,"113,220", +"",Sonoma,"112,271", +"",Yolo,"52,674", +"",Del Norte,"11,242", +"",Humboldt,"49,911", +"",Lake,"29,149", +"",Lassen,"7,360", +"",Modoc,"2,940", +"",Shasta,"61,763", +"",Siskiyou,"16,715", +"",Trinity,"4,542", +"",Merced,Central California Alliance for Health,"123,907" +"",Monterey,"147,397", +"",Santa Cruz,"69,458", +"",Santa Barbara,CenCal,"117,609" +"",San Luis Obispo,"55,761", +"",Orange,CalOptima,"783,079" +"",San Mateo,Health Plan of San Mateo,"113,202" +"",Ventura,Gold Coast Health Plan,"202,217" +"",Total COHS Enrollment,"2,176,064", +"Subtotal for Two-Plan, Regional Model, GMC and COHS","10,132,022",, +PCCM,Los Angeles,AIDS Healthcare Foundation,828 +San Francisco,Family Mosaic,25, +Total PHP Enrollment,853,, +All Models Total Enrollments,"10,132,875",, diff --git a/docs/benchmark/lattice/row_span/row_span-table-detection-camelot.png 
b/docs/benchmark/lattice/row_span/row_span-table-detection-camelot.png new file mode 100755 index 0000000..ed1f215 Binary files /dev/null and b/docs/benchmark/lattice/row_span/row_span-table-detection-camelot.png differ diff --git a/docs/benchmark/lattice/row_span/row_span-table-detection-tabula.png b/docs/benchmark/lattice/row_span/row_span-table-detection-tabula.png new file mode 100755 index 0000000..529100e Binary files /dev/null and b/docs/benchmark/lattice/row_span/row_span-table-detection-tabula.png differ diff --git a/docs/benchmark/lattice/row_span/row_span.pdf b/docs/benchmark/lattice/row_span/row_span.pdf new file mode 100755 index 0000000..ef2c7ce Binary files /dev/null and b/docs/benchmark/lattice/row_span/row_span.pdf differ diff --git a/docs/benchmark/lattice/twotables_1/twotables_1-data-camelot-page-1-table-1.csv b/docs/benchmark/lattice/twotables_1/twotables_1-data-camelot-page-1-table-1.csv new file mode 100755 index 0000000..aea3c60 --- /dev/null +++ b/docs/benchmark/lattice/twotables_1/twotables_1-data-camelot-page-1-table-1.csv @@ -0,0 +1,3 @@ +"Sl. No.","Name of State/UT","Name of District","Disease/ Illness","No. of Cases","No. of Deaths","Date of start of outbreak","Date of reporting","Current Status","Comments/ Action taken" +"","West Bengal","Bankura","xix. Acute Diarrhoeal Disease","46","0","10/11/13","15/11/13","Under Control","Cases of loose motion and vomiting reported from Village Daldali, SC Binodnagar, Block Onda, District Bankura. District RRT and Block health team investigated the outbreak. Active search for cases done. Cases treated at local health centre. 2 stool samples collected were negative for cholera culture. Out of 3 water samples collected 1 sample was non potable. Chlorination of water sources done. Health education given regarding safe drinking water and sanitation." +"","","","xx. 
Acute Diarrhoeal Disease","34","0","10/11/13","14/11/13","Under Control","Cases of loose motion and vomiting reported from Village Icharia, SC Mankhamar, Block Onda, District Bankura. District RRT and Block health team investigated the outbreak. House to house survey done. All cases treated locally. 4 stool samples collected were negative for cholera culture. Out of 34 water samples collected, 8 samples were non potable. Chlorination of water sources done. Health education given regarding safe drinking water and sanitation." diff --git a/docs/benchmark/lattice/twotables_1/twotables_1-data-camelot-page-1-table-2.csv b/docs/benchmark/lattice/twotables_1/twotables_1-data-camelot-page-1-table-2.csv new file mode 100755 index 0000000..3fa96e8 --- /dev/null +++ b/docs/benchmark/lattice/twotables_1/twotables_1-data-camelot-page-1-table-2.csv @@ -0,0 +1,5 @@ +"DISEASE OUTBREAKS OF PREVIOUS WEEKS REPORTED LATE","","","","","","","","" +"Sl. No","Name of State/UT","Name of District","Disease/ Illness","Cases","Deaths","Date of start of outbreak","Current Status","Comments/ Action taken" +"1","Bihar","Madhubani","xxi. Acute Diarrhoeal Disease","69","0","30/09/13","Under Control","Cases of diarrhoea and vomiting reported from Village Indra Mandal tola, SC Sugapatti, PHC Phulparas, District Madhubani. District RRT investigated the outbreak. House to house survey done. Medical camp organized. Symptomatic treatement given to cases. ORS packets and halogen tablets distributed in the community. Chlorination of water sources done. IEC done regarding safe drinking water and sanitation." +"","","Madhubani","xxii. Acute Diarrhoeal Disease","30","1","28/10/13","Under Control","Cases of diarrhoea and vomiting reported from Village/SC Rupauli, PHC Jhanjharpur, District Madhubani. District RRT and Block health team investigated the outbreak. Active search for cases done. Medical camp organized. All cases treated. One death occurred in 14yr old female child. 
ORS packets and halogen tablets distributed. Chlorination of water sources done. IEC activity regarding safe drinking water done." +"","","Katihar","xxiii. Acute Diarrhoeal Disease","13","3","24/10/13","Under control","Cases of diarrhoea reported from Village Ahuta, Alipur, SC Lachhor, PHC/Block Balrampur, District Katihar. District RRT investigated the outbreak. House to house survey done. Cases occurred in a Adiwasi community. 3 deaths occurred in 45yr, 35yr and 5yr old male. Temporary medical camps organized. All cases treated symptomatically. Chlorination of water sources done. Health education given." diff --git a/docs/benchmark/lattice/twotables_1/twotables_1-data-tabula.csv b/docs/benchmark/lattice/twotables_1/twotables_1-data-tabula.csv new file mode 100755 index 0000000..c361662 --- /dev/null +++ b/docs/benchmark/lattice/twotables_1/twotables_1-data-tabula.csv @@ -0,0 +1,63 @@ +"Sl. No.",Sl.,"Name of State/UT",Name of,Name of District,Name of District,Disease/ Illness,Disease/ Illness,,No.,,"No. of Deaths",No. of,,Date of,,"Date of reporting",Date of,"Current Status",Current,,Comments/ Action taken, +No.,State/UT,of,Death,start of,eportin,Status,,,,,,,,,,,,,,,, +"",,Cases,outbreak,,,,,,,,,,,,,,,,,,, +"","xix. Acute Diarrhoeal Disease",46,0,10/11/13,15/11/13,"Under Control","CasesofloosemotionandvomitingreportedfromVillage Daldali, SC Binodnagar, Block Onda, District Bankura. District RRT and Block health team investigated the outbreak. Active search for cases done. Cases treated at local health centre. 2 stool samples collected were negative for cholera culture. Out of 3 water samples collected 1 sample was non potable. Chlorination of water sources done. Health education given regarding safe drinking water and sanitation.",,,,,,,,,,,,,,, +"","ix. Acute",,,,,,,,,,,,,,,,,,,,, +"",Diarrhoeal,,,,,,,,,,,,,,,,,,,,, +"",Disease,,,,,,,,,,,,,,,,,,,,, +"",Bankura,,,,,,,,,,,,,,,,,,,,, +West Bengal,,,,,,,,,,,,,,,,,,,,,, +"",,,,,,,,,,,,,,,,,,,,,, +"","xx. 
Acute",,,,,,,,,,,,,,,,,,,,, +Bankura +"" +"xix. Acute Diarrhoeal Disease" +"ix. Acute" +Diarrhoeal +Disease +"xx. Acute" +Diarrhoeal +Disease +"Sl. No",Sl. +No, +"",DISEASE OUTBREAKS OF PREVIOUS WEEKS REPORTED LATE,,,,,,,,,,,,,,,,, +"Sl. No",Sl.,"Name of State/UT",Name of,Name of District,Name of District,Disease/ Illness,Disease/ Illness,Cases,ases,Deaths,eaths,,Date of,,"Current Status",Current,Comments/ Action taken,Comments/ Action taken +No,State/UT,start of,Status,,,,,,,,,,,,,,, +"",,,,,outbreak,,,,,,,,,,,,, +"",Madhubani,"xxi. Acute Diarrhoeal Disease",69,0,30/09/13,"Under Control","Cases of diarrhoea and vomiting reported from Village Indra Mandal tola, SC Sugapatti, PHC Phulparas, District Madhubani. District RRT investigated the outbreak.Housetohousesurveydone.Medicalcamporganized. Symptomatic treatement given to cases. ORS packets and halogen tablets distributed in the community. Chlorination of water sources done. IEC done regarding safe drinking water and sanitation.",,,,,,,,,,, +"",Madhubani,xxi. Acute,,,,,,,,,,,,,,,, +"",,Diarrhoeal,,,,,,,,,,,,,,,, +"",,Disease,,,,,,,,,,,,,,,, +"",Madhubani,"xxii. Acute Diarrhoeal Disease",30,1,28/10/13,"Under Control","Cases of diarrhoea and vomiting reported from Village/SC Rupauli, PHC Jhanjharpur,DistrictMadhubani.DistrictRRTandBlockhealthteam investigatedtheoutbreak.Activesearchforcasesdone.Medicalcamp organized. All cases treated. One death occurred in 14yr old female child. ORS packets and halogen tablets distributed. Chlorination of water sources done. IEC activity regarding safe drinking water done.",,,,,,,,,,, +"",Madhubani,xxii. Acute,,,,,,,,,,,,,,,, +Bihar,Diarrhoeal,,,,,,,,,,,,,,,,, +"",Disease,,,,,,,,,,,,,,,,, +"",,,,,,,,,,,,,,,,,, +"Name of State/UT",Name of +State/UT, +Name of District,Name of District +Disease/ Illness,Disease/ Illness +Cases,ases +Deaths,eaths +"Current Status",Current +Status, +Comments/ Action taken,Comments/ Action taken +Madhubani +Madhubani +"xxi. 
Acute Diarrhoeal Disease" +xxi. Acute +Diarrhoeal +Disease +"" +Bihar +"" +Madhubani +Madhubani +"xxii. Acute Diarrhoeal Disease" +xxii. Acute +Diarrhoeal +Disease +Katihar +xxiii. Acute +Diarrhoeal +Disease diff --git a/docs/benchmark/lattice/twotables_1/twotables_1-table-detection-camelot.png b/docs/benchmark/lattice/twotables_1/twotables_1-table-detection-camelot.png new file mode 100755 index 0000000..2ae4332 Binary files /dev/null and b/docs/benchmark/lattice/twotables_1/twotables_1-table-detection-camelot.png differ diff --git a/docs/benchmark/lattice/twotables_1/twotables_1-table-detection-tabula.png b/docs/benchmark/lattice/twotables_1/twotables_1-table-detection-tabula.png new file mode 100755 index 0000000..e000f9b Binary files /dev/null and b/docs/benchmark/lattice/twotables_1/twotables_1-table-detection-tabula.png differ diff --git a/docs/benchmark/lattice/twotables_1/twotables_1.pdf b/docs/benchmark/lattice/twotables_1/twotables_1.pdf new file mode 100755 index 0000000..cbbeeda Binary files /dev/null and b/docs/benchmark/lattice/twotables_1/twotables_1.pdf differ diff --git a/docs/benchmark/lattice/twotables_2/twotables_2-data-camelot-page-1-table-1.csv b/docs/benchmark/lattice/twotables_2/twotables_2-data-camelot-page-1-table-1.csv new file mode 100755 index 0000000..a01d451 --- /dev/null +++ b/docs/benchmark/lattice/twotables_2/twotables_2-data-camelot-page-1-table-1.csv @@ -0,0 +1,13 @@ +"State","n","Literacy Status","","","","","" +"","","Illiterate","Read & Write","1-4 std.","5-8 std.","9-12 std.","College" +"Kerala","2400","7.2","0.5","25.3","20.1","41.5","5.5" +"Tamil Nadu","2400","21.4","2.3","8.8","35.5","25.8","6.2" +"Karnataka","2399","37.4","2.8","12.5","18.3","23.1","5.8" +"Andhra Pradesh","2400","54.0","1.7","8.4","13.2","18.8","3.9" +"Maharashtra","2400","22.0","0.9","17.3","20.3","32.6","7.0" +"Gujarat","2390","28.6","0.1","14.4","23.1","26.9","6.8" +"Madhya Pradesh","2402","29.1","3.4","8.5","35.1","13.3","10.6" 
+"Orissa","2405","33.2","1.0","10.4","25.7","21.2","8.5" +"West Bengal","2293","41.7","4.4","13.2","17.1","21.2","2.4" +"Uttar Pradesh","2400","35.3","2.1","4.5","23.3","27.1","7.6" +"Pooled","23889","30.9","1.9","12.3","23.2","25.2","6.4" diff --git a/docs/benchmark/lattice/twotables_2/twotables_2-data-camelot-page-1-table-2.csv b/docs/benchmark/lattice/twotables_2/twotables_2-data-camelot-page-1-table-2.csv new file mode 100755 index 0000000..bf12e87 --- /dev/null +++ b/docs/benchmark/lattice/twotables_2/twotables_2-data-camelot-page-1-table-2.csv @@ -0,0 +1,13 @@ +"State","n","Literacy Status","","","","","" +"","","Illiterate","Read & Write","1-4 std.","5-8 std.","9-12 std.","College" +"Kerala","2400","8.8","0.3","20.1","17.0","45.6","8.2" +"Tamil Nadu","2400","29.9","1.5","8.5","33.1","22.3","4.8" +"Karnataka","2399","47.9","2.5","10.2","18.8","18.4","2.3" +"Andhra Pradesh","2400","66.4","0.7","6.8","12.9","11.4","1.8" +"Maharashtra","2400","41.3","0.6","14.1","20.1","21.6","2.2" +"Gujarat","2390","57.6","0.1","10.3","16.5","12.9","2.7" +"Madhya Pradesh","2402","58.7","2.2","6.6","24.1","5.3","3.0" +"Orissa","2405","50.0","0.9","8.1","21.9","15.1","4.0" +"West Bengal","2293","49.1","4.8","11.2","16.8","17.1","1.1" +"Uttar Pradesh","2400","67.3","2.0","3.1","17.2","7.7","2.7" +"Pooled","23889","47.7","1.5","9.9","19.9","17.8","3.3" diff --git a/docs/benchmark/lattice/twotables_2/twotables_2-table-detection-camelot.png b/docs/benchmark/lattice/twotables_2/twotables_2-table-detection-camelot.png new file mode 100755 index 0000000..74b6e52 Binary files /dev/null and b/docs/benchmark/lattice/twotables_2/twotables_2-table-detection-camelot.png differ diff --git a/docs/benchmark/lattice/twotables_2/twotables_2-table-detection-tabula.png b/docs/benchmark/lattice/twotables_2/twotables_2-table-detection-tabula.png new file mode 100755 index 0000000..54eae27 Binary files /dev/null and b/docs/benchmark/lattice/twotables_2/twotables_2-table-detection-tabula.png differ diff 
--git a/docs/benchmark/lattice/twotables_2/twotables_2-tabula-0.csv b/docs/benchmark/lattice/twotables_2/twotables_2-tabula-0.csv new file mode 100755 index 0000000..9b653c4 --- /dev/null +++ b/docs/benchmark/lattice/twotables_2/twotables_2-tabula-0.csv @@ -0,0 +1,13 @@ +"",,,Literacy Status, +State,n,Read & Illiterate CollegeWrite,1-4 std. 5-8 std. 9-12 std., +Kerala,2400,7.2 0.5,25.3 20.1 41.5,5.5 +Tamil Nadu,2400,21.4 2.3,8.8 35.5 25.8,6.2 +Karnataka,2399,37.4 2.8,12.5 18.3 23.1,5.8 +Andhra Pradesh,2400,54.0 1.7,8.4 13.2 18.8,3.9 +Maharashtra,2400,22.0 0.9,17.3 20.3 32.6,7.0 +Gujarat,2390,28.6 0.1,14.4 23.1 26.9,6.8 +Madhya Pradesh,2402,29.1 3.4,8.5 35.1 13.3,10.6 +Orissa,2405,33.2 1.0,10.4 25.7 21.2,8.5 +West Bengal,2293,41.7 4.4,13.2 17.1 21.2,2.4 +Uttar Pradesh,2400,35.3 2.1,4.5 23.3 27.1,7.6 +Pooled,23889,30.9 1.9,12.3 23.2 25.2,6.4 diff --git a/docs/benchmark/lattice/twotables_2/twotables_2-tabula-1.csv b/docs/benchmark/lattice/twotables_2/twotables_2-tabula-1.csv new file mode 100755 index 0000000..5322e31 --- /dev/null +++ b/docs/benchmark/lattice/twotables_2/twotables_2-tabula-1.csv @@ -0,0 +1,13 @@ +"",,,Literacy Status, +State,n,Read & Illiterate CollegeWrite,1-4 std. 5-8 std. 
9-12 std., +Kerala,2400,8.8 0.3,20.1 17.0 45.6,8.2 +Tamil Nadu,2400,29.9 1.5,8.5 33.1 22.3,4.8 +Karnataka,2399,47.9 2.5,10.2 18.8 18.4,2.3 +Andhra Pradesh,2400,66.4 0.7,6.8 12.9 11.4,1.8 +Maharashtra,2400,41.3 0.6,14.1 20.1 21.6,2.2 +Gujarat,2390,57.6 0.1,10.3 16.5 12.9,2.7 +Madhya Pradesh,2402,58.7 2.2,6.6 24.1 5.3,3.0 +Orissa,2405,50.0 0.9,8.1 21.9 15.1,4.0 +West Bengal,2293,49.1 4.8,11.2 16.8 17.1,1.1 +Uttar Pradesh,2400,67.3 2.0,3.1 17.2 7.7,2.7 +Pooled,23889,47.7 1.5,9.9 19.9 17.8,3.3 diff --git a/docs/benchmark/lattice/twotables_2/twotables_2.pdf b/docs/benchmark/lattice/twotables_2/twotables_2.pdf new file mode 100755 index 0000000..5249887 Binary files /dev/null and b/docs/benchmark/lattice/twotables_2/twotables_2.pdf differ diff --git a/docs/benchmark/stream/12s0324/12s0324-data-camelot-page-1-table-1.csv b/docs/benchmark/stream/12s0324/12s0324-data-camelot-page-1-table-1.csv new file mode 100755 index 0000000..a8cffd6 --- /dev/null +++ b/docs/benchmark/stream/12s0324/12s0324-data-camelot-page-1-table-1.csv @@ -0,0 +1,38 @@ +"","","Total","","","Male","","","Female","" +"Offense charged","","Under 18","18 years","","Under 18","18 years","","Under 18","18 years" +"","Total","years","and over","Total","years","and over","Total","years","and over" +"Total . . . . . . . . . . . . . . . . . . . . . . . . .","11,062 .6","1,540 .0","9,522 .6","8,263 .3","1,071 .6","7,191 .7","2,799 .2","468 .3","2,330 .9" +"Violent crime . . . . . . . . . . . . . . . . . .","467 .9","69 .1","398 .8","380 .2","56 .5","323 .7","87 .7","12 .6","75 .2" +"Murder and nonnegligent","","","","","","","","","" +"manslaughter . . . . . . . .. .. .. .. ..","10.0","0.9","9.1","9.0","0.9","8.1","1.1","–","1.0" +"Forcible rape . . . . . . . .. .. .. .. .. .","17.5","2.6","14.9","17.2","2.5","14.7","–","–","–" +"Robbery . . . .. .. . .. . ... . ... . ...","102.1","25.5","76.6","90.0","22.9","67.1","12.1","2.5","9.5" +"Aggravated assault . . . . . . . .. .. 
..","338.4","40.1","298.3","264.0","30.2","233.8","74.4","9.9","64.5" +"Property crime . . . . . . . . . . . . . . . . .","1,396 .4","338 .7","1,057 .7","875 .9","210 .8","665 .1","608 .2","127 .9","392 .6" +"Burglary . .. . . . . .. ... .... .... ..","240.9","60.3","180.6","205.0","53.4","151.7","35.9","6.9","29.0" +"Larceny-theft . . . . . . . .. .. .. .. .. .","1,080.1","258.1","822.0","608.8","140.5","468.3","471.3","117.6","353.6" +"Motor vehicle theft . . . . .. .. . .... .","65.6","16.0","49.6","53.9","13.3","40.7","11.7","2.7","8.9" +"Arson .. . . . .. . ... .... .... .... .","9.8","4.3","5.5","8.1","3.7","4.4","1.7","0.6","1.1" +"Other assaults .. . . . . .. . ... . ... ..","1,061.3","175.3","886.1","785.4","115.4","670.0","276.0","59.9","216.1" +"Forgery and counterfeiting .. . . . . . ..","68.9","1.7","67.2","42.9","1.2","41.7","26.0","0.5","25.5" +"Fraud .... .. . . .. ... .... .... ....","173.7","5.1","168.5","98.4","3.3","95.0","75.3","1.8","73.5" +"Embezzlement . . .. . . . .. . ... . ....","14.6","–","14.1","7.2","–","6.9","7.4","–","7.2" +"Stolen property 1 . . . . . . .. . .. .. ...","84.3","15.1","69.2","66.7","12.2","54.5","17.6","2.8","14.7" +"Vandalism . . . . . . . .. .. .. .. .. ....","217.4","72.7","144.7","178.1","62.8","115.3","39.3","9.9","29.4" +"Weapons; carrying, possessing, etc. .","132.9","27.1","105.8","122.1","24.3","97.8","10.8","2.8","8.0" +"Prostitution and commercialized vice","56.9","1.1","55.8","17.3","–","17.1","39.6","0.8","38.7" +"Sex offenses 2 . . . . .. . . . .. .. .. . ..","61.5","10.7","50.7","56.1","9.6","46.5","5.4","1.1","4.3" +"Drug abuse violations . . . . . . . .. ...","1,333.0","136.6","1,196.4","1,084.3","115.2","969.1","248.7","21.4","227.3" +"Gambling .. . . . . .. ... . ... . ... ...","8.2","1.4","6.8","7.2","1.4","5.9","0.9","–","0.9" +"Offenses against the family and","","","","","","","","","" +"children . . . .. . . .. .. .. .. .. .. . 
..","92.4","3.7","88.7","68.9","2.4","66.6","23.4","1.3","22.1" +"Driving under the influence . . . . . .. .","1,158.5","109.2","1,147.5","895.8","8.2","887.6","262.7","2.7","260.0" +"Liquor laws . . . . . . . .. .. .. .. .. .. .","48.2","90.2","368.0","326.8","55.4","271.4","131.4","34.7","96.6" +"Drunkenness . . .. . . . .. . ... . ... ..","488.1","11.4","476.8","406.8","8.5","398.3","81.3","2.9","78.4" +"Disorderly conduct . .. . . . . . .. .. .. .","529.5","136.1","393.3","387.1","90.8","296.2","142.4","45.3","97.1" +"Vagrancy . . . .. . . . ... .... .... ...","26.6","2.2","24.4","20.9","1.6","19.3","5.7","0.6","5.1" +"All other offenses (except traffic) . . ..","306.1","263.4","2,800.8","2,337.1","194.2","2,142.9","727.0","69.2","657.9" +"Suspicion . . . .. . . .. .. .. .. .. .. . ..","1.6","–","1.4","1.2","–","1.0","–","–","–" +"Curfew and loitering law violations ..","91.0","91.0","(X)","63.1","63.1","(X)","28.0","28.0","(X)" +"Runaways . . . . . . . .. .. .. .. .. ....","75.8","75.8","(X)","34.0","34.0","(X)","41.8","41.8","(X)" +"","– Represents zero. X Not applicable. 1 Buying, receiving, possessing stolen property. 2 Except forcible rape and prostitution.","","","","","","","","" diff --git a/docs/benchmark/stream/12s0324/12s0324-data-camelot-page-1-table-2.csv b/docs/benchmark/stream/12s0324/12s0324-data-camelot-page-1-table-2.csv new file mode 100755 index 0000000..b5b7f28 --- /dev/null +++ b/docs/benchmark/stream/12s0324/12s0324-data-camelot-page-1-table-2.csv @@ -0,0 +1,36 @@ +"","","","","American","" +"Offense charged","","","","Indian/Alaskan","Asian Pacific" +"","Total","White","Black","Native","Islander" +"Total . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .","10,690,561","7,389,208","3,027,153","150,544","123,656" +"Violent crime . . . . . . . . . . . . . . . . . . . . . . . . . . . .","456,965","268,346","177,766","5,608","5,245" +"Murder and nonnegligent manslaughter . .. ... 
.","9,739","4,741","4,801","100","97" +"Forcible rape . . . . . . . .. .. .. .. .... .. ...... .","16,362","10,644","5,319","169","230" +"Robbery . . . . .. . . . ... . ... . .... .... .... . . .","100,496","43,039","55,742","726","989" +"Aggravated assault . . . . . . . .. .. ...... .. ....","330,368","209,922","111,904","4,613","3,929" +"Property crime . . . . . . . . . . . . . . . . . . . . . . . . . . .","1,364,409","922,139","406,382","17,599","18,289" +"Burglary . . .. . . . .. . .... .... .... .... ... . . .","234,551","155,994","74,419","2,021","2,117" +"Larceny-theft . . . . . . . .. .. .. .. .... .. ...... .","1,056,473","719,983","306,625","14,646","15,219" +"Motor vehicle theft . . . . . .. ... . ... ..... ... ..","63,919","39,077","23,184","817","841" +"Arson .. . . .. .. .. ... .... .... .... .... . . . . .","9,466","7,085","2,154","115","112" +"Other assaults .. . . . . . ... . ... . ... ..... ... ..","1,032,502","672,865","332,435","15,127","12,075" +"Forgery and counterfeiting .. . . . . . ... ..... .. ..","67,054","44,730","21,251","345","728" +"Fraud ... . . . . .. .. .. .. .. .. .. .. .. .... . . . . . .","161,233","108,032","50,367","1,315","1,519" +"Embezzlement . . . .. . . . ... . ... . .... ... .....","13,960","9,208","4,429","75","248" +"Stolen property; buying, receiving, possessing .. .","82,714","51,953","29,357","662","742" +"Vandalism . . . . . . . .. .. .. .. .. .. .... .. ..... .","212,173","157,723","48,746","3,352","2,352" +"Weapons—carrying, possessing, etc. .. .. ... .. .","130,503","74,942","53,441","951","1,169" +"Prostitution and commercialized vice . ... .. .. ..","56,560","31,699","23,021","427","1,413" +"Sex offenses 1 . . . . . . . .. .. .. .. .... .. ...... .","60,175","44,240","14,347","715","873" +"Drug abuse violations . . . . . . . .. . ..... .. .....","1,301,629","845,974","437,623","8,588","9,444" +"Gambling . . . . .. . . . ... . ... . .. ... . ...... .. 
.","8,046","2,290","5,518","27","211" +"Offenses against the family and children ... .. .. .","87,232","58,068","26,850","1,690","624" +"Driving under the influence . . . . . . .. ... ...... .","1,105,401","954,444","121,594","14,903","14,460" +"Liquor laws . . . . . . . .. .. .. .. .. . ..... .. .....","444,087","373,189","50,431","14,876","5,591" +"Drunkenness . .. . . . . . ... . ... . ..... . .......","469,958","387,542","71,020","8,552","2,844" +"Disorderly conduct . . .. . . . . .. .. . ..... .. .....","515,689","326,563","176,169","8,783","4,174" +"Vagrancy . . .. .. . . .. ... .... .... .... .... . . .","26,347","14,581","11,031","543","192" +"All other offenses (except traffic) . .. .. .. ..... ..","2,929,217","1,937,221","911,670","43,880","36,446" +"Suspicion . . .. . . . .. .. .. .. .. .. .. ...... .. . . .","1,513","677","828","1","7" +"Curfew and loitering law violations . .. ... .. ....","89,578","54,439","33,207","872","1,060" +"Runaways . . . . . . . .. .. .. .. .. .. .... .. ..... .","73,616","48,343","19,670","1,653","3,950" +"1 Except forcible rape and prostitution.","","","","","" diff --git a/docs/benchmark/stream/12s0324/12s0324-data-tabula-0.csv b/docs/benchmark/stream/12s0324/12s0324-data-tabula-0.csv new file mode 100755 index 0000000..3de14fe --- /dev/null +++ b/docs/benchmark/stream/12s0324/12s0324-data-tabula-0.csv @@ -0,0 +1,37 @@ +"",,Total,,,Male,,,Female, +Offense charged,,Under 18,18 years,,Under 18,18 years,,Under 18,18 years +"",Total,years,and over,Total,years,and over,Total,years,and over +Total . . . . . . . . . . . . . . . . . . . . . . . . .,"11,062 .6","1,540 .0","9,522 .6","8,263 .3","1,071 .6","7,191 .7","2,799 .2",468 .3,"2,330 .9" +Violent crime . . . . . . . . . . . . . . . . . .,467 .9,69 .1,398 .8,380 .2,56 .5,323 .7,87 .7,12 .6,75 .2 +Murder and nonnegligent,,,,,,,,, +manslaughter . . . . . . . . . . . . . . . . .,10.0,0.9,9.1,9.0,0.9,8.1,1.1,–,1.0 +Forcible rape . . . . . . . . . . . . . . . . . 
.,17.5,2.6,14.9,17.2,2.5,14.7,–,–,– +Robbery . . . . . . . . . . . . . . . . . . . . . .,102.1,25.5,76.6,90.0,22.9,67.1,12.1,2.5,9.5 +Aggravated assault . . . . . . . . . . . . .,338.4,40.1,298.3,264.0,30.2,233.8,74.4,9.9,64.5 +Property crime . . . . . . . . . . . . . . . . .,"1,396 .4",338 .7,"1,057 .7",875 .9,210 .8,665 .1,608 .2,127 .9,392 .6 +Burglary . . . . . . . . . . . . . . . . . . . . . .,240.9,60.3,180.6,205.0,53.4,151.7,35.9,6.9,29.0 +Larceny-theft . . . . . . . . . . . . . . . . . .,"1,080.1",258.1,822.0,608.8,140.5,468.3,471.3,117.6,353.6 +Motor vehicle theft . . . . . . . . . . . . . .,65.6,16.0,49.6,53.9,13.3,40.7,11.7,2.7,8.9 +Arson . . . . . . . . . . . . . . . . . . . . . . . .,9.8,4.3,5.5,8.1,3.7,4.4,1.7,0.6,1.1 +Other assaults . . . . . . . . . . . . . . . . . .,"1,061.3",175.3,886.1,785.4,115.4,670.0,276.0,59.9,216.1 +Forgery and counterfeiting . . . . . . . . .,68.9,1.7,67.2,42.9,1.2,41.7,26.0,0.5,25.5 +Fraud . . . . . . . . . . . . . . . . . . . . . . . . .,173.7,5.1,168.5,98.4,3.3,95.0,75.3,1.8,73.5 +Embezzlement . . . . . . . . . . . . . . . . . .,14.6,–,14.1,7.2,–,6.9,7.4,–,7.2 +Stolen property 1 . . . . . . . . . . . . . . . .,84.3,15.1,69.2,66.7,12.2,54.5,17.6,2.8,14.7 +Vandalism . . . . . . . . . . . . . . . . . . . . .,217.4,72.7,144.7,178.1,62.8,115.3,39.3,9.9,29.4 +"Weapons; carrying, possessing, etc. .",132.9,27.1,105.8,122.1,24.3,97.8,10.8,2.8,8.0 +Prostitution and commercialized vice,56.9,1.1,55.8,17.3,–,17.1,39.6,0.8,38.7 +Sex offenses 2 . . . . . . . . . . . . . . . . . .,61.5,10.7,50.7,56.1,9.6,46.5,5.4,1.1,4.3 +Drug abuse violations . . . . . . . . . . . .,"1,333.0",136.6,"1,196.4","1,084.3",115.2,969.1,248.7,21.4,227.3 +Gambling . . . . . . . . . . . . . . . . . . . . . .,8.2,1.4,6.8,7.2,1.4,5.9,0.9,–,0.9 +Offenses against the family and,,,,,,,,, +children . . . . . . . . . . . . . . . . . . . . . .,92.4,3.7,88.7,68.9,2.4,66.6,23.4,1.3,22.1 +Driving under the influence . . . . . . . 
.,"1,158.5",109.2,"1,147.5",895.8,8.2,887.6,262.7,2.7,260.0 +Liquor laws . . . . . . . . . . . . . . . . . . . .,48.2,90.2,368.0,326.8,55.4,271.4,131.4,34.7,96.6 +Drunkenness . . . . . . . . . . . . . . . . . . .,488.1,11.4,476.8,406.8,8.5,398.3,81.3,2.9,78.4 +Disorderly conduct . . . . . . . . . . . . . . .,529.5,136.1,393.3,387.1,90.8,296.2,142.4,45.3,97.1 +Vagrancy . . . . . . . . . . . . . . . . . . . . . .,26.6,2.2,24.4,20.9,1.6,19.3,5.7,0.6,5.1 +All other offenses (except traffic) . . . .,306.1,263.4,"2,800.8","2,337.1",194.2,"2,142.9",727.0,69.2,657.9 +Suspicion . . . . . . . . . . . . . . . . . . . . . .,1.6,–,1.4,1.2,–,1.0,–,–,– +Curfew and loitering law violations . .,91.0,91.0,(X),63.1,63.1,(X),28.0,28.0,(X) +Runaways . . . . . . . . . . . . . . . . . . . . .,75.8,75.8,(X),34.0,34.0,(X),41.8,41.8,(X) diff --git a/docs/benchmark/stream/12s0324/12s0324-data-tabula-1.csv b/docs/benchmark/stream/12s0324/12s0324-data-tabula-1.csv new file mode 100755 index 0000000..f51232e --- /dev/null +++ b/docs/benchmark/stream/12s0324/12s0324-data-tabula-1.csv @@ -0,0 +1,35 @@ +"",,,,American, +Offense charged,,,,Indian/Alaskan,Asian Pacific +"",Total,White,Black,Native,Islander +Total . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .,"10,690,561","7,389,208","3,027,153","150,544","123,656" +Violent crime . . . . . . . . . . . . . . . . . . . . . . . . . . . .,"456,965","268,346","177,766","5,608","5,245" +Murder and nonnegligent manslaughter . . . . . . .,"9,739","4,741","4,801",100,97 +Forcible rape . . . . . . . . . . . . . . . . . . . . . . . . . . . .,"16,362","10,644","5,319",169,230 +Robbery . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .,"100,496","43,039","55,742",726,989 +Aggravated assault . . . . . . . . . . . . . . . . . . . . . . .,"330,368","209,922","111,904","4,613","3,929" +Property crime . . . . . . . . . . . . . . . . . . . . . . . . . . .,"1,364,409","922,139","406,382","17,599","18,289" +Burglary . . . . . . . . . 
. . . . . . . . . . . . . . . . . . . . . . .,"234,551","155,994","74,419","2,021","2,117" +Larceny-theft . . . . . . . . . . . . . . . . . . . . . . . . . . . .,"1,056,473","719,983","306,625","14,646","15,219" +Motor vehicle theft . . . . . . . . . . . . . . . . . . . . . . . .,"63,919","39,077","23,184",817,841 +Arson . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .,"9,466","7,085","2,154",115,112 +Other assaults . . . . . . . . . . . . . . . . . . . . . . . . . . . .,"1,032,502","672,865","332,435","15,127","12,075" +Forgery and counterfeiting . . . . . . . . . . . . . . . . . . .,"67,054","44,730","21,251",345,728 +Fraud . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .,"161,233","108,032","50,367","1,315","1,519" +Embezzlement . . . . . . . . . . . . . . . . . . . . . . . . . . . .,"13,960","9,208","4,429",75,248 +"Stolen property; buying, receiving, possessing . . .","82,714","51,953","29,357",662,742 +Vandalism . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .,"212,173","157,723","48,746","3,352","2,352" +"Weapons—carrying, possessing, etc. . . . . . . . . . .","130,503","74,942","53,441",951,"1,169" +Prostitution and commercialized vice . . . . . . . . . .,"56,560","31,699","23,021",427,"1,413" +Sex offenses 1 . . . . . . . . . . . . . . . . . . . . . . . . . . . .,"60,175","44,240","14,347",715,873 +Drug abuse violations . . . . . . . . . . . . . . . . . . . . . .,"1,301,629","845,974","437,623","8,588","9,444" +Gambling . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .,"8,046","2,290","5,518",27,211 +Offenses against the family and children . . . . . . . .,"87,232","58,068","26,850","1,690",624 +Driving under the influence . . . . . . . . . . . . . . . . . .,"1,105,401","954,444","121,594","14,903","14,460" +Liquor laws . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .,"444,087","373,189","50,431","14,876","5,591" +Drunkenness . . . . . . . . . . . . . . . . . . . . . . . . . . . 
. .,"469,958","387,542","71,020","8,552","2,844" +Disorderly conduct . . . . . . . . . . . . . . . . . . . . . . . . .,"515,689","326,563","176,169","8,783","4,174" +Vagrancy . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .,"26,347","14,581","11,031",543,192 +All other offenses (except traffic) . . . . . . . . . . . . . .,"2,929,217","1,937,221","911,670","43,880","36,446" +Suspicion . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .,"1,513",677,828,1,7 +Curfew and loitering law violations . . . . . . . . . . . .,"89,578","54,439","33,207",872,"1,060" +Runaways . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .,"73,616","48,343","19,670","1,653","3,950" diff --git a/docs/benchmark/stream/12s0324/12s0324.pdf b/docs/benchmark/stream/12s0324/12s0324.pdf new file mode 100755 index 0000000..c192726 Binary files /dev/null and b/docs/benchmark/stream/12s0324/12s0324.pdf differ diff --git a/docs/benchmark/stream/birdisland/birdisland-data-camelot-page-1-table-1.csv b/docs/benchmark/stream/birdisland/birdisland-data-camelot-page-1-table-1.csv new file mode 100755 index 0000000..a05fcaa --- /dev/null +++ b/docs/benchmark/stream/birdisland/birdisland-data-camelot-page-1-table-1.csv @@ -0,0 +1,35 @@ +"","","","","","SCN","Seed","Yield","Moisture","Lodgingg","g","Stand","","Gross" +"Company/Brandpy","","Product/Brand†","Technol.†","Mat.","Resist.","Trmt.†","Bu/A","%","%","","(x 1000)(",")","Income" +"KrugerKruger","","K2-1901K2 1901","RR2YRR2Y","1.91.9","RR","Ac,PVAc,PV","56.456.4","7.67.6","00","","126.3126.3","","$846$846" +"StineStine","","19RA02 §19RA02 §","RR2YRR2Y","1 91.9","RR","CMBCMB","55.355.3","7 67.6","00","","120 0120.0","","$830$830" +"WensmanWensman","","W 3190NR2W 3190NR2","RR2YRR2Y","1 91.9","RR","AcAc","54 554.5","7 67.6","00","","119 5119.5","","$818$818" +"H ftHefty","","H17Y12H17Y12","RR2YRR2Y","1 71.7","MRMR","II","53 753.7","7 77.7","00","","124 4124.4","","$806$806" 
+"Dyna-Gro","","S15RY53","RR2Y","1.5","R","Ac","53.6","7.7","0","","126.8","","$804" +"LG SeedsLG Seeds","","C2050R2C2050R2","RR2YRR2Y","2.12.1","RR","AcAc","53.653.6","7.77.7","00","","123.9123.9","","$804$804" +"Titan ProTitan Pro","","19M4219M42","RR2YRR2Y","1.91.9","RR","CMBCMB","53.653.6","7.77.7","00","","121.0121.0","","$804$804" +"StineStine","","19RA02 (2) §19RA02 (2) §","RR2YRR2Y","1 91.9","RR","CMBCMB","53 453.4","7 77.7","00","","123 9123.9","","$801$801" +"AsgrowAsgrow","","AG1832 §AG1832 §","RR2YRR2Y","1 81.8","MRMR","Ac PVAc,PV","52 952.9","7 77.7","00","","122 0122.0","","$794$794" +"Prairie Brandiid","","PB-1566R2662","RR2Y2","1.5","R","CMB","52.8","7.7","0","","122.9","","$792$" +"Channel","","1901R2","RR2Y","1.9","R","Ac,PV,","52.8","7.6","0","","123.4","","$791$" +"Titan ProTitan Pro","","20M120M1","RR2YRR2Y","2.02.0","RR","AmAm","52.552.5","7.57.5","00","","124.4124.4","","$788$788" +"KrugerKruger","","K2-2002K2-2002","RR2YRR2Y","2 02.0","RR","Ac PVAc,PV","52 452.4","7 97.9","00","","125 4125.4","","$786$786" +"ChannelChannel","","1700R21700R2","RR2YRR2Y","1 71.7","RR","Ac PVAc,PV","52 352.3","7 97.9","00","","123 9123.9","","$784$784" +"H ftHefty","","H16Y11H16Y11","RR2YRR2Y","1 61.6","MRMR","II","51 451.4","7 67.6","00","","123 9123.9","","$771$771" +"Anderson","","162R2Y","RR2Y","1.6","R","None","51.3","7.5","0","","119.5","","$770" +"Titan ProTitan Pro","","15M2215M22","RR2YRR2Y","1.51.5","RR","CMBCMB","51.351.3","7.87.8","00","","125.4125.4","","$769$769" +"DairylandDairyland","","DSR-1710R2YDSR-1710R2Y","RR2YRR2Y","1 71.7","RR","CMBCMB","51 351.3","7 77.7","00","","122 0122.0","","$769$769" +"HeftyHefty","","H20R3H20R3","RR2YRR2Y","2 02.0","MRMR","II","50 550.5","8 28.2","00","","121 0121.0","","$757$757" +"PPrairie BrandiiBd","","PB 1743R2PB-1743R2","RR2YRR2Y","1 71.7","RR","CMBCMB","50 250.2","7 77.7","00","","125 8125.8","","$752$752" +"Gold Country","","1741","RR2Y","1.7","R","Ac","50.1","7.8","0","","123.9","","$751" +"Trelaye 
ay","","20RR4303","RR2Y","2.00","R","Ac,Exc,","49.99 9","7.66","00","","127.88","","$749$9" +"HeftyHefty","","H14R3H14R3","RR2YRR2Y","1.41.4","MRMR","II","49.749.7","7.77.7","00","","122.9122.9","","$746$746" +"Prairie BrandPrairie Brand","","PB-2099NRR2PB-2099NRR2","RR2YRR2Y","2 02.0","RR","CMBCMB","49 649.6","7 87.8","00","","126 3126.3","","$743$743" +"WensmanWensman","","W 3174NR2W 3174NR2","RR2YRR2Y","1 71.7","RR","AcAc","49 349.3","7 67.6","00","","122 5122.5","","$740$740" +"KKruger","","K2 1602K2-1602","RR2YRR2Y","1 61.6","R","Ac,PV","48.78","7.66","00","","125.412","","$731$31" +"NK Brand","","S18-C2 §§","RR2Y","1.8","R","CMB","48.7","7.7","0","","126.8","","$731$" +"KrugerKruger","","K2-1902K2 1902","RR2YRR2Y","1.91.9","RR","Ac,PVAc,PV","48.748.7","7.57.5","00","","124.4124.4","","$730$730" +"Prairie BrandPrairie Brand","","PB-1823R2PB-1823R2","RR2YRR2Y","1 81.8","RR","NoneNone","48 548.5","7 67.6","00","","121 0121.0","","$727$727" +"Gold CountryGold Country","","15411541","RR2YRR2Y","1 51.5","RR","AcAc","48 448.4","7 67.6","00","","110 4110.4","","$726$726" +"","","","","","","Test Average =","47 647.6","7 77.7","00","","122 9122.9","","$713$713" +"","","","","","","LSD (0.10) =","5.7","0.3","ns","","37.8","","566.4" +"","F.I.R.S.T. Managerg","","","","","C.V. 
=","8.8","2.9","","","56.4","","846.2" diff --git a/docs/benchmark/stream/birdisland/birdisland-data-tabula.csv b/docs/benchmark/stream/birdisland/birdisland-data-tabula.csv new file mode 100755 index 0000000..1791ee9 --- /dev/null +++ b/docs/benchmark/stream/birdisland/birdisland-data-tabula.csv @@ -0,0 +1,30 @@ +Kruger,K2-1901,RR2Y,1.9,R,"Ac,PV",56.4,7.6,0,126.3,$846 +Stine,19RA02 §,RR2Y,1.9,R,CMB,55.3,7.6,0,120.0,$830 +Wensman,W 3190NR2,RR2Y,1.9,R,Ac,54.5,7.6,0,119.5,$818 +Hefty,H17Y12,RR2Y,1.7,MR,I,53.7,7.7,0,124.4,$806 +Dyna-Gro,S15RY53,RR2Y,1.5,R,Ac,53.6,7.7,0,126.8,$804 +LG Seeds,C2050R2,RR2Y,2.1,R,Ac,53.6,7.7,0,123.9,$804 +Titan Pro,19M42,RR2Y,1.9,R,CMB,53.6,7.7,0,121.0,$804 +Stine,19RA02 (2) §,RR2Y,1.9,R,CMB,53.4,7.7,0,123.9,$801 +Asgrow,AG1832 §,RR2Y,1.8,MR,"Ac,PV",52.9,7.7,0,122.0,$794 +Prairie Brand,PB-1566R2,RR2Y,1.5,R,CMB,52.8,7.7,0,122.9,$792 +Channel,1901R2,RR2Y,1.9,R,"Ac,PV",52.8,7.6,0,123.4,$791 +Titan Pro,20M1,RR2Y,2.0,R,Am,52.5,7.5,0,124.4,$788 +Kruger,K2-2002,RR2Y,2.0,R,"Ac,PV",52.4,7.9,0,125.4,$786 +Channel,1700R2,RR2Y,1.7,R,"Ac,PV",52.3,7.9,0,123.9,$784 +Hefty,H16Y11,RR2Y,1.6,MR,I,51.4,7.6,0,123.9,$771 +Anderson,162R2Y,RR2Y,1.6,R,None,51.3,7.5,0,119.5,$770 +Titan Pro,15M22,RR2Y,1.5,R,CMB,51.3,7.8,0,125.4,$769 +Dairyland,DSR-1710R2Y,RR2Y,1.7,R,CMB,51.3,7.7,0,122.0,$769 +Hefty,H20R3,RR2Y,2.0,MR,I,50.5,8.2,0,121.0,$757 +Prairie Brand,PB-1743R2,RR2Y,1.7,R,CMB,50.2,7.7,0,125.8,$752 +Gold Country,1741,RR2Y,1.7,R,Ac,50.1,7.8,0,123.9,$751 +Trelay,20RR43,RR2Y,2.0,R,"Ac,Ex",49.9,7.6,0,127.8,$749 +Hefty,H14R3,RR2Y,1.4,MR,I,49.7,7.7,0,122.9,$746 +Prairie Brand,PB-2099NRR2,RR2Y,2.0,R,CMB,49.6,7.8,0,126.3,$743 +Wensman,W 3174NR2,RR2Y,1.7,R,Ac,49.3,7.6,0,122.5,$740 +Kruger,K2-1602,RR2Y,1.6,R,"Ac,PV",48.7,7.6,0,125.4,$731 +NK Brand,S18-C2 §,RR2Y,1.8,R,CMB,48.7,7.7,0,126.8,$731 +Kruger,K2-1902,RR2Y,1.9,R,"Ac,PV",48.7,7.5,0,124.4,$730 +Prairie Brand,PB-1823R2,RR2Y,1.8,R,None,48.5,7.6,0,121.0,$727 +Gold Country,1541,RR2Y,1.5,R,Ac,48.4,7.6,0,110.4,$726 diff 
--git a/docs/benchmark/stream/birdisland/birdisland.pdf b/docs/benchmark/stream/birdisland/birdisland.pdf new file mode 100755 index 0000000..1501158 Binary files /dev/null and b/docs/benchmark/stream/birdisland/birdisland.pdf differ diff --git a/docs/benchmark/stream/budget/budget-data-camelot-page-1-table-1.csv b/docs/benchmark/stream/budget/budget-data-camelot-page-1-table-1.csv new file mode 100755 index 0000000..7b49956 --- /dev/null +++ b/docs/benchmark/stream/budget/budget-data-camelot-page-1-table-1.csv @@ -0,0 +1,46 @@ +"","Budget Provisions, net of receipts and recoveries, for the year 2014-2015 are as under.","","","","","","","" +"","Further details are on pages indicated in the last column of this Statement","","","","","","","" +"Budget 2014-2015","","","","","","","","" +"","","","","","","","(In crores of Rupees)","" +"","","Plan","","","Non-Plan","","Total Plan &","" +"","","","","","","","Non-Plan","" +"Ministry/Demand","Revenue","Capital","Total","Revenue","Capital","Total","","Page No." +"MINISTRY OF AGRICULTURE","28130.48","67.52","28198.00","2863.09","1.85","2864.94","31062.94","" +"1. Department of Agriculture and Cooperation","22260.55","48.45","22309.00","342.51","0.74","343.25","22652.25","1-10" +"2. Department of Agricultural Research and Education","3715.00","...","3715.00","2429.39","...","2429.39","6144.39","11-13" +"3. Department of Animal Husbandry, Dairying and Fisheries","2154.93","19.07","2174.00","91.19","1.11","92.30","2266.30","14-19" +"DEPARTMENT OF ATOMIC ENERGY","1779.00","4101.00","5880.00","3710.84","855.75","4566.59","10446.59","" +"4. Atomic Energy","1483.00","3427.00","4910.00","2971.25","855.75","3827.00","8737.00","20-25" +"5. Nuclear Power Schemes","296.00","674.00","970.00","739.59","...","739.59","1709.59","26-27" +"MINISTRY OF CHEMICALS AND FERTILISERS","360.83","153.17","514.00","73104.46","0.09","73104.55","73618.55","" +"6. 
Department of Chemicals and Petrochemicals","171.49","35.51","207.00","63.67","0.01","63.68","270.68","28-30" +"7. Department of Fertilisers","12.34","87.66","100.00","72999.96","0.04","73000.00","73100.00","31-33" +"8. Department of Pharmaceuticals","177.00","30.00","207.00","40.83","0.04","40.87","247.87","34-36" +"MINISTRY OF CIVIL AVIATION","179.90","6540.10","6720.00","657.98","...","657.98","7377.98","" +"9. Ministry of Civil Aviation","179.90","6540.10","6720.00","657.98","...","657.98","7377.98","37-39" +"MINISTRY OF COAL","550.00","...","550.00","50.00","...","50.00","600.00","" +"10. Ministry of Coal","550.00","...","550.00","50.00","...","50.00","600.00","40-41" +"MINISTRY OF COMMERCE AND INDUSTRY","3515.75","410.25","3926.00","3873.02","...","3873.02","7799.02","" +"11. Department of Commerce","1921.50","304.50","2226.00","3628.00","...","3628.00","5854.00","42-47" +"12. Department of Industrial Policy and Promotion","1594.25","105.75","1700.00","245.02","...","245.02","1945.02","48-52" +"MINISTRY OF COMMUNICATIONS AND INFORMATION TECHNOLOGY","7704.70","4410.30","12115.00","13953.92","101.00","14054.92","26169.92","" +"13. Department of Posts","286.70","513.30","800.00","6907.76","5.00","6912.76","7712.76","53-54" +"14. Department of Telecommunications","3798.00","3702.00","7500.00","6932.06","96.00","7028.06","14528.06","55-58" +"15. Department of Electronics and Information Technology","3620.00","195.00","3815.00","114.10","...","114.10","3929.10","59-63" +"MINISTRY OF CONSUMER AFFAIRS, FOOD AND PUBLIC DISTRIBUTION","344.45","205.55","550.00","115402.63","...","115402.63","115952.63","" +"16. Department of Consumer Affairs","192.70","27.30","220.00","75.79","...","75.79","295.79","64-66" +"17. Department of Food and Public Distribution","151.75","178.25","330.00","115326.84","...","115326.84","115656.84","67-70" +"MINISTRY OF CORPORATE AFFAIRS","22.76","1.24","24.00","208.75","22.50","231.25","255.25","" +"18. 
Ministry of Corporate Affairs","22.76","1.24","24.00","208.75","22.50","231.25","255.25","71-72" +"MINISTRY OF CULTURE","1767.00","68.00","1835.00","676.00","...","676.00","2511.00","" +"19. Ministry of Culture","1767.00","68.00","1835.00","676.00","...","676.00","2511.00","73-78" +"MINISTRY OF DEFENCE","...","...","...","188994.20","96208.67","285202.87","285202.87","" +"20. Ministry of Defence","...","...","...","3582.15","1620.72","5202.87","5202.87","79-80" +"21. Defence Pensions","...","...","...","51000.00","...","51000.00","51000.00","81-81" +"22. Defence Services-Army","...","...","...","92669.32","...","92669.32","92669.32","82-82" +"23. Defence Services-Navy","...","...","...","13975.79","...","13975.79","13975.79","83-83" +"24. Defence Services-Air Force","...","...","...","20506.84","...","20506.84","20506.84","84-84" +"25. Defence Ordnance Factories","...","...","...","1275.43","...","1275.43","1275.43","85-85" +"26. Defence Services – Research and Development","...","...","...","5984.67","...","5984.67","5984.67","86-86" +"27. Capital Outlay on Defence Services","...","...","...","...","94587.95","94587.95","94587.95","87-88" +"","","","","","","","SBE Summary of Contents","" diff --git a/docs/benchmark/stream/budget/budget-data-tabula.csv b/docs/benchmark/stream/budget/budget-data-tabula.csv new file mode 100755 index 0000000..4385259 --- /dev/null +++ b/docs/benchmark/stream/budget/budget-data-tabula.csv @@ -0,0 +1,43 @@ +"",Budget 2014-2015,,,,,,,(In crores of Rupees) +"",,,Plan,,,Non-Plan,,Total Plan & +Ministry/Demand,,Revenue,Capital,Total,Revenue,Capital,Total,Non-Plan Page No. 
+"",,,,,,,, +"",,,,,,,, +MINISTRY OF AGRICULTURE,,28130.48,67.52,28198.00,2863.09,1.85,2864.94,31062.94 +1.Department of Agriculture and Cooperation,,22260.55,48.45,22309.00,342.51,0.74,343.25,22652.25 1-10 +2.Department of Agricultural Research and Education,,3715.00,...,3715.00,2429.39,...,2429.39,6144.39 11-13 +"3.Department of Animal Husbandry, Dairying and Fisheries",,2154.93,19.07,2174.00,91.19,1.11,92.30,2266.30 14-19 +DEPARTMENT OF ATOMIC ENERGY,,1779.00,4101.00,5880.00,3710.84,855.75,4566.59,10446.59 +4.Atomic Energy,,1483.00,3427.00,4910.00,2971.25,855.75,3827.00,8737.00 20-25 +5.Nuclear Power Schemes,,296.00,674.00,970.00,739.59,...,739.59,1709.59 26-27 +MINISTRY OF CHEMICALS AND FERTILISERS,,360.83,153.17,514.00,73104.46,0.09,73104.55,73618.55 +6.Department of Chemicals and Petrochemicals,,171.49,35.51,207.00,63.67,0.01,63.68,270.68 28-30 +7.Department of Fertilisers,,12.34,87.66,100.00,72999.96,0.04,73000.00,73100.00 31-33 +8.Department of Pharmaceuticals,,177.00,30.00,207.00,40.83,0.04,40.87,247.87 34-36 +MINISTRY OF CIVIL AVIATION,,179.90,6540.10,6720.00,657.98,...,657.98,7377.98 +9.Ministry of Civil Aviation,,179.90,6540.10,6720.00,657.98,...,657.98,7377.98 37-39 +MINISTRY OF COAL,,550.00,...,550.00,50.00,...,50.00,600.00 +10.Ministry of Coal,,550.00,...,550.00,50.00,...,50.00,600.00 40-41 +MINISTRY OF COMMERCE AND INDUSTRY,,3515.75,410.25,3926.00,3873.02,...,3873.02,7799.02 +11.Department of Commerce,,1921.50,304.50,2226.00,3628.00,...,3628.00,5854.00 42-47 +12.Department of Industrial Policy and Promotion,,1594.25,105.75,1700.00,245.02,...,245.02,1945.02 48-52 +MINISTRY OF COMMUNICATIONS AND INFORMATION TECHNOLOGY,,7704.70,4410.30,12115.00,13953.92,101.00,14054.92,26169.92 +13.Department of Posts,,286.70,513.30,800.00,6907.76,5.00,6912.76,7712.76 53-54 +14.Department of Telecommunications,,3798.00,3702.00,7500.00,6932.06,96.00,7028.06,14528.06 55-58 +15.Department of Electronics and Information 
Technology,,3620.00,195.00,3815.00,114.10,...,114.10,3929.10 59-63 +"MINISTRY OF CONSUMER AFFAIRS, FOOD AND PUBLIC DISTRIBUTION",,344.45,205.55,550.00,115402.63,...,115402.63,115952.63 +16.Department of Consumer Affairs,,192.70,27.30,220.00,75.79,...,75.79,295.79 64-66 +17.Department of Food and Public Distribution,,151.75,178.25,330.00,115326.84,...,115326.84,115656.84 67-70 +MINISTRY OF CORPORATE AFFAIRS,,22.76,1.24,24.00,208.75,22.50,231.25,255.25 +18.Ministry of Corporate Affairs,,22.76,1.24,24.00,208.75,22.50,231.25,255.25 71-72 +MINISTRY OF CULTURE,,1767.00,68.00,1835.00,676.00,...,676.00,2511.00 +19.Ministry of Culture,,1767.00,68.00,1835.00,676.00,...,676.00,2511.00 73-78 +MINISTRY OF DEFENCE,,...,...,...,188994.20,96208.67,285202.87,285202.87 +20.Ministry of Defence,,...,...,...,3582.15,1620.72,5202.87,5202.87 79-80 +21.Defence Pensions,,...,...,...,51000.00,...,51000.00,51000.00 81-81 +22.Defence Services-Army,,...,...,...,92669.32,...,92669.32,92669.32 82-82 +23.Defence Services-Navy,,...,...,...,13975.79,...,13975.79,13975.79 83-83 +24.Defence Services-Air Force,,...,...,...,20506.84,...,20506.84,20506.84 84-84 +25.Defence Ordnance Factories,,...,...,...,1275.43,...,1275.43,1275.43 85-85 +26.Defence Services – Research and Development,,...,...,...,5984.67,...,5984.67,5984.67 86-86 +27.Capital Outlay on Defence Services,,...,...,...,...,94587.95,94587.95,94587.95 87-88 diff --git a/docs/benchmark/stream/budget/budget.pdf b/docs/benchmark/stream/budget/budget.pdf new file mode 100755 index 0000000..9466e87 Binary files /dev/null and b/docs/benchmark/stream/budget/budget.pdf differ diff --git a/docs/benchmark/stream/district_health/district_health-data-camelot-page-1-table-1.csv b/docs/benchmark/stream/district_health/district_health-data-camelot-page-1-table-1.csv new file mode 100755 index 0000000..9251cf6 --- /dev/null +++ b/docs/benchmark/stream/district_health/district_health-data-camelot-page-1-table-1.csv @@ -0,0 +1,50 @@ +"Chandel- Key 
Indicators","","","","" +"","DLHS-4 (2012-13)","","DLHS-3 (2007-08)","" +"Indicators","TOTAL","RURAL","TOTAL","RURAL" +"Child feeding practices (based on last-born child in the reference period) (%)","","","","" +"Children age 0-5 months exclusively breastfed9 ..........................................................................","76.9","80.0","NA","NA" +"Children age 6-9 months receiving solid/semi-solid food and breast milk ....................................","78.6","75.0","85.9","89.3" +"Children age 12-23 months receiving breast feeding along with complementary feeding ...........","31.8","24.2","NA","NA" +"Children age 6-35 months exclusively breastfed for at least 6 months ........................................","4.7","3.4","30.0","27.7" +"Children under 3 years breastfed within one hour of birth ............................................................","42.9","46.5","50.6","52.9" +"Birth Weight (%) (age below 36 months)","","","","" +"Percentage of Children weighed at birth ......................................................................................","38.8","41.0","NA","NA" +"Percentage of Children with low birth weight (out of those who weighted) ( below 2.5 kg) .........","12.8","14.6","NA","NA" +"Awareness about Diarrhoea (%)","","","","" +"Awareness about ARI (%)","","","","" +"Women know about what to do when a child gets diarrhoea .....................................................","96.3","96.2","94.4","94.2" +"Women aware about danger signs of ARI10 .................................................................................","55.9","59.7","32.8","34.7" +"Treatment of childhood diseases (based on last two surviving children born during the","","","","" +"reference period) (%)","","","","" +"Prevalence of diarrhoea in last 2 weeks for under 5 years old children .......................................","1.6","1.3","6.5","7.0" +"Children with diarrhoea in the last 2 weeks and received ORS11 
.................................................","100.0","100.0","54.8","53.3" +"Children with diarrhoea in the last 2 weeks and sought advice/treatment ...................................","100.0","50.0","72.9","73.3" +"Prevalence of ARI in last 2 weeks for under 5 years old children ............................................","4.3","3.9","3.9","4.2" +"Children with acute respiratory infection or fever in last 2 weeks and sought advice/treatment","37.5","33.3","69.8","68.0" +"Children with diarrhoea in the last 2 weeks given Zinc along with ORS ......................................","66.6","50.0","NA","NA" +"Awareness of RTI/STI and HIV/AIDS (%)","","","","" +"Women who have heard of RTI/STI .............................................................................................","55.8","57.1","34.8","38.2" +"Women who have heard of HIV/AIDS ..........................................................................................","98.9","99.0","98.3","98.1" +"Women who have any symptoms of RTI/STI ..............................................................................","13.9","13.5","15.6","16.1" +"Women who know the place to go for testing of HIV/AIDS12 .......................................................","59.9","57.1","48.6","46.3" +"Women underwent test for detecting HIV/AIDS12 ........................................................................","37.3","36.8","14.1","12.3" +"Utilization of Government Health Services (%)","","","","" +"Antenatal care ..............................................................................................................................","69.7","66.7","79.0","81.0" +"Treatment for pregnancy complications .......................................................................................","57.1","59.3","88.0","87.8" +"Treatment for post-delivery complications ...................................................................................","33.3","33.3","68.4","68.4" +"Treatment for vaginal 
discharge ...................................................................................................","20.0","25.0","73.9","71.4" +"Treatment for children with diarrhoea13 ........................................................................................","50.0","100.0","NA","NA" +"Treatment for children with ARI13 .................................................................................................","NA","NA","NA","NA" +"Birth Registration (%)","","","","" +"Children below age 5 years having birth registration done ..........................................................","40.6","44.3","NA","NA" +"Children below age 5 years who received birth certificate (out of those registered) ....................","65.9","63.6","NA","NA" +"Personal Habits (age 15 years and above) (%)","","","","" +"Men who use any kind of smokeless tobacco .............................................................................","74.6","74.2","NA","NA" +"Women who use any kind of smokeless tobacco ........................................................................","59.5","58.9","NA","NA" +"Men who smoke ...........................................................................................................................","56.0","56.4","NA","NA" +"Women who smoke ......................................................................................................................","18.4","18.0","NA","NA" +"Men who consume alcohol ...........................................................................................................","58.4","58.2","NA","NA" +"Women who consume alcohol .....................................................................................................","10.9","9.3","NA","NA" +"9 Children Who were given nothing but breast milk till the survey date 10Acute Respiratory Infections 11Oral Rehydration Solutions/Salts.12Based on","","","","" +"the women who have heard of HIV/AIDS.13 Last two weeks","","","","" 
+"3","","","","" diff --git a/docs/benchmark/stream/district_health/district_health-data-tabula.csv b/docs/benchmark/stream/district_health/district_health-data-tabula.csv new file mode 100755 index 0000000..bd7fd72 --- /dev/null +++ b/docs/benchmark/stream/district_health/district_health-data-tabula.csv @@ -0,0 +1,48 @@ +"",DLHS-4 (2012-13) DLHS-3 (2007-08) +Indicators,TOTAL RURAL TOTAL RURAL +Child feeding practices (based on last-born child in the reference period) (%), +Children age 0-5 months exclusively breastfed9 ..........................................................................,76.9 80.0 NA NA +Children age 6-9 months receiving solid/semi-solid food and breast milk ....................................,78.6 75.0 85.9 89.3 +Children age 12-23 months receiving breast feeding along with complementary feeding ...........,31.8 24.2 NA NA +Children age 6-35 months exclusively breastfed for at least 6 months ........................................,4.7 3.4 30.0 27.7 +Children under 3 years breastfed within one hour of birth ............................................................,42.9 46.5 50.6 52.9 +Birth Weight (%) (age below 36 months), +Percentage of Children weighed at birth ......................................................................................,38.8 41.0 NA NA +Percentage of Children with low birth weight (out of those who weighted) ( below 2.5 kg) .........,12.8 14.6 NA NA +Awareness about Diarrhoea (%), +Women know about what to do when a child gets diarrhoea .....................................................,96.3 96.2 94.4 94.2 +Awareness about ARI (%), +Women aware about danger signs of ARI10 .................................................................................,55.9 59.7 32.8 34.7 +Treatment of childhood diseases (based on last two surviving children born during the, +reference period) (%), +Prevalence of diarrhoea in last 2 weeks for under 5 years old children .......................................,1.6 1.3 6.5 
7.0 +Children with diarrhoea in the last 2 weeks and received ORS11 .................................................,100.0 100.0 54.8 53.3 +Children with diarrhoea in the last 2 weeks and sought advice/treatment ...................................,100.0 50.0 72.9 73.3 +Prevalence of ARI in last 2 weeks for under 5 years old children ............................................,4.3 3.9 3.9 4.2 +Children with acute respiratory infection or fever in last 2 weeks and sought advice/treatment,37.5 33.3 69.8 68.0 +Children with diarrhoea in the last 2 weeks given Zinc along with ORS ......................................,66.6 50.0 NA NA +Awareness of RTI/STI and HIV/AIDS (%), +Women who have heard of RTI/STI .............................................................................................,55.8 57.1 34.8 38.2 +Women who have heard of HIV/AIDS ..........................................................................................,98.9 99.0 98.3 98.1 +Women who have any symptoms of RTI/STI ..............................................................................,13.9 13.5 15.6 16.1 +Women who know the place to go for testing of HIV/AIDS12 .......................................................,59.9 57.1 48.6 46.3 +Women underwent test for detecting HIV/AIDS12 ........................................................................,37.3 36.8 14.1 12.3 +Utilization of Government Health Services (%), +Antenatal care ..............................................................................................................................,69.7 66.7 79.0 81.0 +Treatment for pregnancy complications .......................................................................................,57.1 59.3 88.0 87.8 +Treatment for post-delivery complications ...................................................................................,33.3 33.3 68.4 68.4 +Treatment for vaginal discharge 
...................................................................................................,20.0 25.0 73.9 71.4 +Treatment for children with diarrhoea13 ........................................................................................,50.0 100.0 NA NA +Treatment for children with ARI13 .................................................................................................,NA NA NA NA +Birth Registration (%), +Children below age 5 years having birth registration done ..........................................................,40.6 44.3 NA NA +Children below age 5 years who received birth certificate (out of those registered) ....................,65.9 63.6 NA NA +Personal Habits (age 15 years and above) (%), +Men who use any kind of smokeless tobacco .............................................................................,74.6 74.2 NA NA +Women who use any kind of smokeless tobacco ........................................................................,59.5 58.9 NA NA +Men who smoke ...........................................................................................................................,56.0 56.4 NA NA +Women who smoke ......................................................................................................................,18.4 18.0 NA NA +Men who consume alcohol ...........................................................................................................,58.4 58.2 NA NA +Women who consume alcohol .....................................................................................................,10.9 9.3 NA NA +9 Children Who were given nothing but breast milk till the survey date 10Acute Respiratory Infections 11Oral Rehydration Solutions/Salts.12Based on, +the women who have heard of HIV/AIDS.13 Last two weeks, diff --git a/docs/benchmark/stream/district_health/district_health.pdf b/docs/benchmark/stream/district_health/district_health.pdf new file mode 100755 index 0000000..3ab299b Binary 
files /dev/null and b/docs/benchmark/stream/district_health/district_health.pdf differ diff --git a/docs/benchmark/stream/health/health-data-camelot-page-1-table-1.csv b/docs/benchmark/stream/health/health-data-camelot-page-1-table-1.csv new file mode 100755 index 0000000..311468e --- /dev/null +++ b/docs/benchmark/stream/health/health-data-camelot-page-1-table-1.csv @@ -0,0 +1,31 @@ +"","Table: 5 Public Health Outlay 2012-13 (Budget Estimates) (Rs. in 000)","","","","","","" +"States-A","Revenue","","Capital","","Total","Others(1)","Total" +"","","","","","Revenue &","","" +"","Medical &","Family","Medical &","Family","","","" +"","","","","","Capital","","" +"","Public","Welfare","Public","Welfare","","","" +"","Health","","Health","","","","" +"Andhra Pradesh","47,824,589","9,967,837","1,275,000","15,000","59,082,426","14,898,243","73,980,669" +"Arunachal Pradesh","2,241,609","107,549","23,000","0","2,372,158","86,336","2,458,494" +"Assam","14,874,821","2,554,197","161,600","0","17,590,618","4,408,505","21,999,123" +"Bihar","21,016,708","4,332,141","5,329,000","0","30,677,849","2,251,571","32,929,420" +"Chhattisgarh","11,427,311","1,415,660","2,366,592","0","15,209,563","311,163","15,520,726" +"Delhi","28,084,780","411,700","4,550,000","0","33,046,480","5,000","33,051,480" +"Goa","4,055,567","110,000","330,053","0","4,495,620","12,560","4,508,180" +"Gujarat","26,328,400","6,922,900","12,664,000","42,000","45,957,300","455,860","46,413,160" +"Haryana","15,156,681","1,333,527","40,100","0","16,530,308","1,222,698","17,753,006" +"Himachal Pradesh","8,647,229","1,331,529","580,800","0","10,559,558","725,315","11,284,873" +"Jammu & Kashmir","14,411,984","270,840","3,188,550","0","17,871,374","166,229","18,037,603" +"Jharkhand","8,185,079","3,008,077","3,525,558","0","14,718,714","745,139","15,463,853" +"Karnataka","34,939,843","4,317,801","3,669,700","0","42,927,344","631,088","43,558,432" 
+"Kerala","27,923,965","3,985,473","929,503","0","32,838,941","334,640","33,173,581" +"Madhya Pradesh","28,459,540","4,072,016","3,432,711","0","35,964,267","472,139","36,436,406" +"Maharashtra","55,011,100","6,680,721","5,038,576","0","66,730,397","313,762","67,044,159" +"Manipur","2,494,600","187,700","897,400","0","3,579,700","0","3,579,700" +"Meghalaya","2,894,093","342,893","705,500","5,000","3,947,486","24,128","3,971,614" +"Mizoram","1,743,501","84,185","10,250","0","1,837,936","17,060","1,854,996" +"Nagaland","2,368,724","204,329","226,400","0","2,799,453","783,054","3,582,507" +"Odisha","14,317,179","2,552,292","1,107,250","0","17,976,721","451,438","18,428,159" +"Puducherry","4,191,757","52,249","192,400","0","4,436,406","2,173","4,438,579" +"Punjab","19,775,485","2,208,343","2,470,882","0","24,454,710","1,436,522","25,891,232" +"","Health Sector Financing by Centre and States/UTs in India [2009-10 to 2012-13](Revised) P a g e |23","","","","","","" diff --git a/docs/benchmark/stream/health/health-data-tabula.csv b/docs/benchmark/stream/health/health-data-tabula.csv new file mode 100755 index 0000000..29fc687 --- /dev/null +++ b/docs/benchmark/stream/health/health-data-tabula.csv @@ -0,0 +1,28 @@ +Table: 5,Public Health Outlay 2012-13 (Budget Estimates)(Rs. 
in 000), +States-A,Revenue Capital Total Others(1),Total +"",Medical & Family Medical & Family Revenue &, +"",Public Welfare Public Welfare Capital, +"",Health Health, +Andhra Pradesh,"47,824,589 9,967,837 1,275,000 15,000 59,082,426 14,898,243","73,980,669" +Arunachal Pradesh,"2,241,609 107,549 23,000 0 2,372,158 86,336","2,458,494" +Assam,"14,874,821 2,554,197 161,600 0 17,590,618 4,408,505","21,999,123" +Bihar,"21,016,708 4,332,141 5,329,000 0 30,677,849 2,251,571","32,929,420" +Chhattisgarh,"11,427,311 1,415,660 2,366,592 0 15,209,563 311,163","15,520,726" +Delhi,"28,084,780 411,700 4,550,000 0 33,046,480 5,000","33,051,480" +Goa,"4,055,567 110,000 330,053 0 4,495,620 12,560","4,508,180" +Gujarat,"26,328,400 6,922,900 12,664,000 42,000 45,957,300 455,860","46,413,160" +Haryana,"15,156,681 1,333,527 40,100 0 16,530,308 1,222,698","17,753,006" +Himachal Pradesh,"8,647,229 1,331,529 580,800 0 10,559,558 725,315","11,284,873" +Jammu & Kashmir,"14,411,984 270,840 3,188,550 0 17,871,374 166,229","18,037,603" +Jharkhand,"8,185,079 3,008,077 3,525,558 0 14,718,714 745,139","15,463,853" +Karnataka,"34,939,843 4,317,801 3,669,700 0 42,927,344 631,088","43,558,432" +Kerala,"27,923,965 3,985,473 929,503 0 32,838,941 334,640","33,173,581" +Madhya Pradesh,"28,459,540 4,072,016 3,432,711 0 35,964,267 472,139","36,436,406" +Maharashtra,"55,011,100 6,680,721 5,038,576 0 66,730,397 313,762","67,044,159" +Manipur,"2,494,600 187,700 897,400 0 3,579,700 0","3,579,700" +Meghalaya,"2,894,093 342,893 705,500 5,000 3,947,486 24,128","3,971,614" +Mizoram,"1,743,501 84,185 10,250 0 1,837,936 17,060","1,854,996" +Nagaland,"2,368,724 204,329 226,400 0 2,799,453 783,054","3,582,507" +Odisha,"14,317,179 2,552,292 1,107,250 0 17,976,721 451,438","18,428,159" +Puducherry,"4,191,757 52,249 192,400 0 4,436,406 2,173","4,438,579" +Punjab,"19,775,485 2,208,343 2,470,882 0 24,454,710 1,436,522","25,891,232" diff --git a/docs/benchmark/stream/health/health.pdf 
b/docs/benchmark/stream/health/health.pdf new file mode 100755 index 0000000..b9247ab Binary files /dev/null and b/docs/benchmark/stream/health/health.pdf differ diff --git a/docs/benchmark/stream/m27/m27-data-camelot-page-1-table-1.csv b/docs/benchmark/stream/m27/m27-data-camelot-page-1-table-1.csv new file mode 100755 index 0000000..2d2539e --- /dev/null +++ b/docs/benchmark/stream/m27/m27-data-camelot-page-1-table-1.csv @@ -0,0 +1,47 @@ +"FEB","RUAR","Y 2014 M27 (BUS)","","ALPHABETIC LISTING BY T","YPE","","","","ABLPDM27" +"","","","","OF ACTIVE LICENSES","","","","","3/19/2014" +"","","","","OKLAHOMA ABLE COMMIS","SION","","","","" +"LICENSE","","","","PREMISE","","","","","" +"NUMBER","TYPE","DBA NAME","LICENSEE NAME","ADDRESS","CITY","ST","ZIP","PHONE NUMBER","EXPIRES" +"648765","AAA","ALLEGIANT AIR","ALLEGIANT AIR LLC","7100 TERMINAL DRIVE","OKLAHOMA CITY","OK","73159","-","2014/12/03" +"","","","","7777 EAST APACHE","","","","","" +"648766","AAA","ALLEGIANT AIR","ALLEGIANT AIR LLC","STREET","TULSA","OK","74115","-","2014/12/16" +"82030","AAA","AMERICAN AIRLINES","AMERICAN AIRLINES INC","7100 TERMINAL DRIVE","OKLAHOMA CITY","OK","73159","(405) 680-3701","2014/09/14" +"509462","AAA","AMERICAN AIRLINES","AMERICAN AIRLINES INC","7777 EAST APACHE DRIVE","TULSA","OK","74115","(918) 831-6302","2014/08/19" +"","","","AMERICAN EAGLE","","","","","","" +"509609","AAA","AMERICAN EAGLE","AIRLINES INC","7100 TERMINAL DRIVE","OKLAHOMA CITY","OK","73159","(405) 680-3701","2014/08/19" +"","","","AMERICAN EAGLE","","","","","","" +"402986","AAA","AMERICAN EAGLE","AIRLINES INC","7777 EAST APACHE DRIVE","TULSA","OK","74115","(859) 767-3747","2014/10/22" +"","","","","WILL ROGERS AIRPORT","","","","","" +"79145","AAA","DELTA AIR LINES","DELTA AIR LINES INC","BOX 59975","OKLAHOMA CITY","OK","73159","(404) 773-9745","2014/05/11" +"600941","AAA","ENDEAVOR AIR","ENDEAVOR AIR INC","7100 TERMINAL DRIVE","OKLAHOMA CITY","OK","73159","(901) 348-4100","2015/03/26" +"","","","","7100 
TERMINAL DRIVE","","","","","" +"478482","AAA","EXPRESSJET AIRLINES","EXPRESSJET AIRLINES INC","WILL ROGERS AIRPORT","OKLAHOMA CITY","OK","73159","(832) 353-1201","2014/05/08" +"505981","AAA","SKYWEST AIRLINES","SKYWEST INC","7100 TERMINAL DRIVE","OKLAHOMA CITY","OK","73159","(405) 634-3000","2014/05/28" +"429754","AAA","SOUTHWEST AIRLINES","SOUTHWEST AIRLINES CO","7100 TERMINAL DRIVE","OKLAHOMA CITY","OK","73159","(405) 682-4183","2015/02/15" +"","","TULSA INTERNATIONAL","","","","","","","" +"429755","AAA","AIRPORT","SOUTHWEST AIRLINES CO","7777 EAST APACHE DRIVE","TULSA","OK","74115","(918) 834-4495","2015/02/16" +"415051","AAA","UNITED AIRLINES","UNITED AIRLINES INC","7777 EAST APACHE DRIVE","TULSA","OK","74115","(872) 825-8309","2014/05/12" +"106719","AAA","UNITED AIRLINES","UNITED AIRLINES INC","WILL ROGERS AIRPORT","OKLAHOMA CITY","OK","73159","(872) 825-8309","2014/04/11" +"","","A SENSU JAPANESE","","7123 SOUTH 92ND EAST","","","","","" +"625422","BAW","RESTAURANT","INFORMAL PARTNERSHIP","AVENUE SUITE J","TULSA","OK","74133","(918) 252-0333","2015/02/14" +"","","ADAMO'S ROUTE 66","","2132 WEST GARY","","","","","" +"464828","BAW","ITALIAN VILLA","TADJ INC","BOULEVARD","CLINTON","OK","73601","(580) 323-5900","2015/02/11" +"","","","","12215 NORTH","","","","","" +"184066","BAW","AJANTA","CABAB N' CURRY INC","PENNSYLVANIA","OKLAHOMA CITY","OK","73120","(405) 752-5283","2014/07/27" +"","","","SAYRE LODGING","","","","","","" +"547693","BAW","AMERICINN OF SAYRE","ENTERPRISES LLC","2405 SOUTH EL CAMINO","SAYRE","OK","73662","(580) 928-2700","2014/09/08" +"","","ANDOLINI'S PIZZERIA &","","12140 EAST 96TH STREET","","","","","" +"428377","BAW","ITALIAN RESTAURANT","ANDOLINI'S LLC","NORTH #106","OWASSO","OK","74055","(918) 272-9325","2015/02/10" +"","","ASAHI JAPANESE","","","","","","","" +"446957","BAW","RESTAURANT","JIN CORPORATION","7831 EAST 71ST STREET","TULSA","OK","74133","(918) 307-9151","2014/12/22" +"","","","SMOKEHOUSE","","","","","","" 
+"632501","BAW","BACK DOOR BARBECUE","ASSOCIATES INC","315 NORTHWEST 23RD","OKLAHOMA CITY","OK","73103","-","2014/08/01" +"598515","BAW","BAMBOO THAI BISTRO","BAMBOO THAI BISTRO INC","5079 SOUTH YALE AVENUE","TULSA","OK","74135","(918) 828-0740","2015/03/11" +"","","BANDANA RED'S","","","","","","","" +"618693","BAW","STEAKHOUSE","BRADSHAW, STEVE_LEN","37808 OLD HIGHWAY 270","SHAWNEE","OK","74804","-","2014/08/20" +"","","","","1522 WEST LINDSEY","","","","","" +"632575","BAW","BASHU LEGENDS","HYH HE CHUANG LLC","STREET","NORMAN","OK","73069","-","2014/07/21" +"","","","DEEP FORK HOLDINGS","","","","","","" +"543149","BAW","BEDLAM BAR-B-Q","LLC","610 NORTHEAST 50TH","OKLAHOMA CITY","OK","73105","(405) 528-7427","2015/02/23" +"","","","","Page 1 of 151","","","","","" diff --git a/docs/benchmark/stream/m27/m27-data-tabula-0.csv b/docs/benchmark/stream/m27/m27-data-tabula-0.csv new file mode 100755 index 0000000..58648c8 --- /dev/null +++ b/docs/benchmark/stream/m27/m27-data-tabula-0.csv @@ -0,0 +1,41 @@ +648765 AAA,ALLEGIANT AIR,ALLEGIANT AIR LLC7100 TERMINAL DRIVE,OKLAHOMA CITY,,OK,73159,-,2014/12/03 +"",,7777 EAST APACHE,,,,,, +648766 AAA,ALLEGIANT AIR,ALLEGIANT AIR LLCSTREET,TULSA,,OK,74115,-,2014/12/16 +82030 AAA,AMERICAN AIRLINES,AMERICAN AIRLINES INC7100 TERMINAL DRIVE,OKLAHOMA CITY,,OK,73159,(405) 680-3701,2014/09/14 +509462 AAA,AMERICAN AIRLINES,AMERICAN AIRLINES INC7777 EAST APACHE DRIVE,TULSA,,OK,74115,(918) 831-6302,2014/08/19 +"",,AMERICAN EAGLE,,,,,, +509609 AAA,AMERICAN EAGLE,AIRLINES INC7100 TERMINAL DRIVE,OKLAHOMA CITY,,OK,73159,(405) 680-3701,2014/08/19 +"",,AMERICAN EAGLE,,,,,, +402986 AAA,AMERICAN EAGLE,AIRLINES INC7777 EAST APACHE DRIVE,TULSA,,OK,74115,(859) 767-3747,2014/10/22 +"",,WILL ROGERS AIRPORT,,,,,, +79145 AAA,DELTA AIR LINES,DELTA AIR LINES INCBOX 59975,OKLAHOMA CITY,,OK,73159,(404) 773-9745,2014/05/11 +600941 AAA,ENDEAVOR AIR,ENDEAVOR AIR INC7100 TERMINAL DRIVE,OKLAHOMA CITY,,OK,73159,(901) 348-4100,2015/03/26 +"",,7100 TERMINAL 
DRIVE,,,,,, +478482 AAA,EXPRESSJET AIRLINES,EXPRESSJET AIRLINES INC WILL ROGERS AIRPORT,OKLAHOMA CITY,,OK,73159,(832) 353-1201,2014/05/08 +505981 AAA,SKYWEST AIRLINES,SKYWEST INC7100 TERMINAL DRIVE,OKLAHOMA CITY,,OK,73159,(405) 634-3000,2014/05/28 +429754 AAA,SOUTHWEST AIRLINES,SOUTHWEST AIRLINES CO 7100 TERMINAL DRIVE,OKLAHOMA CITY,,OK,73159,(405) 682-4183,2015/02/15 +"",TULSA INTERNATIONAL,,,,,,, +429755 AAA,AIRPORT,SOUTHWEST AIRLINES CO 7777 EAST APACHE DRIVE,TULSA,,OK,74115,(918) 834-4495,2015/02/16 +415051 AAA,UNITED AIRLINES,UNITED AIRLINES INC7777 EAST APACHE DRIVE,TULSA,,OK,74115,(872) 825-8309,2014/05/12 +106719 AAA,UNITED AIRLINES,UNITED AIRLINES INCWILL ROGERS AIRPORT,OKLAHOMA CITY,,OK,73159,(872) 825-8309,2014/04/11 +"",A SENSU JAPANESE,7123 SOUTH 92ND EAST,,,,,, +625422 BAW,RESTAURANT,INFORMAL PARTNERSHIPAVENUE SUITE J,TULSA,,OK,74133,(918) 252-0333,2015/02/14 +"",ADAMO'S ROUTE 66,2132 WEST GARY,,,,,, +464828 BAW,ITALIAN VILLA,TADJ INCBOULEVARD,CLINTON,,OK,73601,(580) 323-5900,2015/02/11 +"",,12215 NORTH,,,,,, +184066 BAW,AJANTA,CABAB N' CURRY INCPENNSYLVANIA,OKLAHOMA CITY,,OK,73120,(405) 752-5283,2014/07/27 +"",,SAYRE LODGING,,,,,, +547693 BAW,AMERICINN OF SAYRE,ENTERPRISES LLC2405 SOUTH EL CAMINO,SAYRE,,OK,73662,(580) 928-2700,2014/09/08 +"",ANDOLINI'S PIZZERIA &,12140 EAST 96TH STREET,,,,,, +428377 BAW,ITALIAN RESTAURANT,ANDOLINI'S LLCNORTH #106,OWASSO,,OK,74055,(918) 272-9325,2015/02/10 +"",ASAHI JAPANESE,,,,,,, +446957 BAW,RESTAURANT,JIN CORPORATION7831 EAST 71ST STREET,TULSA,,OK,74133,(918) 307-9151,2014/12/22 +"",,SMOKEHOUSE,,,,,, +632501 BAW,BACK DOOR BARBECUE,ASSOCIATES INC315 NORTHWEST 23RD,OKLAHOMA CITY,,OK,73103,-,2014/08/01 +598515 BAW,BAMBOO THAI BISTRO,BAMBOO THAI BISTRO INC 5079 SOUTH YALE AVENUE TULSA,,,OK,74135,(918) 828-0740,2015/03/11 +"",BANDANA RED'S,,,,,,, +618693 BAW,STEAKHOUSE,"BRADSHAW, STEVE_LEN37808 OLD HIGHWAY 270",SHAWNEE,,OK,74804,-,2014/08/20 +"",,1522 WEST LINDSEY,,,,,, +632575 BAW,BASHU LEGENDS,HYH HE CHUANG 
LLCSTREET,NORMAN,,OK,73069,-,2014/07/21 +"",,DEEP FORK HOLDINGS,,,,,, +543149 BAW,BEDLAM BAR-B-Q,LLC610 NORTHEAST 50TH,OKLAHOMA CITY,,OK,73105,(405) 528-7427,2015/02/23 diff --git a/docs/benchmark/stream/m27/m27.pdf b/docs/benchmark/stream/m27/m27.pdf new file mode 100755 index 0000000..cecd7b6 Binary files /dev/null and b/docs/benchmark/stream/m27/m27.pdf differ diff --git a/docs/benchmark/stream/mexican_towns/mexican_towns-data-camelot-page-1-table-1.csv b/docs/benchmark/stream/mexican_towns/mexican_towns-data-camelot-page-1-table-1.csv new file mode 100755 index 0000000..e2a6a45 --- /dev/null +++ b/docs/benchmark/stream/mexican_towns/mexican_towns-data-camelot-page-1-table-1.csv @@ -0,0 +1,44 @@ +"Clave","","Clave","","","Clave","" +"","Nombre Entidad","","","Nombre Municipio","","Nombre Localidad" +"Entidad","","Municipio","","","Localidad","" +"01","Aguascalientes","001","Aguascalientes","","0094","Granja Adelita" +"01","Aguascalientes","001","Aguascalientes","","0096","Agua Azul" +"01","Aguascalientes","001","Aguascalientes","","0100","Rancho Alegre" +"01","Aguascalientes","001","Aguascalientes","","0102","Los Arbolitos [Rancho]" +"01","Aguascalientes","001","Aguascalientes","","0104","Ardillas de Abajo (Las Ardillas)" +"01","Aguascalientes","001","Aguascalientes","","0106","Arellano" +"01","Aguascalientes","001","Aguascalientes","","0112","Bajío los Vázquez" +"01","Aguascalientes","001","Aguascalientes","","0113","Bajío de Montoro" +"01","Aguascalientes","001","Aguascalientes","","0114","Residencial San Nicolás [Baños la Cantera]" +"01","Aguascalientes","001","Aguascalientes","","0120","Buenavista de Peñuelas" +"01","Aguascalientes","001","Aguascalientes","","0121","Cabecita 3 Marías (Rancho Nuevo)" +"01","Aguascalientes","001","Aguascalientes","","0125","Cañada Grande de Cotorina" +"01","Aguascalientes","001","Aguascalientes","","0126","Cañada Honda [Estación]" +"01","Aguascalientes","001","Aguascalientes","","0127","Los Caños" 
+"01","Aguascalientes","001","Aguascalientes","","0128","El Cariñán" +"01","Aguascalientes","001","Aguascalientes","","0129","El Carmen [Granja]" +"01","Aguascalientes","001","Aguascalientes","","0135","El Cedazo (Cedazo de San Antonio)" +"01","Aguascalientes","001","Aguascalientes","","0138","Centro de Arriba (El Taray)" +"01","Aguascalientes","001","Aguascalientes","","0139","Cieneguilla (La Lumbrera)" +"01","Aguascalientes","001","Aguascalientes","","0141","Cobos" +"01","Aguascalientes","001","Aguascalientes","","0144","El Colorado (El Soyatal)" +"01","Aguascalientes","001","Aguascalientes","","0146","El Conejal" +"01","Aguascalientes","001","Aguascalientes","","0157","Cotorina de Abajo" +"01","Aguascalientes","001","Aguascalientes","","0162","Coyotes" +"01","Aguascalientes","001","Aguascalientes","","0166","La Huerta (La Cruz)" +"01","Aguascalientes","001","Aguascalientes","","0170","Cuauhtémoc (Las Palomas)" +"01","Aguascalientes","001","Aguascalientes","","0171","Los Cuervos (Los Ojos de Agua)" +"01","Aguascalientes","001","Aguascalientes","","0172","San José [Granja]" +"01","Aguascalientes","001","Aguascalientes","","0176","La Chiripa" +"01","Aguascalientes","001","Aguascalientes","","0182","Dolores" +"01","Aguascalientes","001","Aguascalientes","","0183","Los Dolores" +"01","Aguascalientes","001","Aguascalientes","","0190","El Duraznillo" +"01","Aguascalientes","001","Aguascalientes","","0191","Los Durón" +"01","Aguascalientes","001","Aguascalientes","","0197","La Escondida" +"01","Aguascalientes","001","Aguascalientes","","0201","Brande Vin [Bodegas]" +"01","Aguascalientes","001","Aguascalientes","","0207","Valle Redondo" +"01","Aguascalientes","001","Aguascalientes","","0209","La Fortuna" +"01","Aguascalientes","001","Aguascalientes","","0212","Lomas del Gachupín" +"01","Aguascalientes","001","Aguascalientes","","0213","El Carmen (Gallinas Güeras) [Rancho]" +"01","Aguascalientes","001","Aguascalientes","","0216","La Gloria" 
+"01","Aguascalientes","001","Aguascalientes","","0226","Hacienda Nueva" diff --git a/docs/benchmark/stream/mexican_towns/mexican_towns-data-tabula.csv b/docs/benchmark/stream/mexican_towns/mexican_towns-data-tabula.csv new file mode 100755 index 0000000..9fc5afb --- /dev/null +++ b/docs/benchmark/stream/mexican_towns/mexican_towns-data-tabula.csv @@ -0,0 +1,41 @@ +01,Aguascalientes,001,Aguascalientes,0094,Granja Adelita +01,Aguascalientes,001,Aguascalientes,0096,Agua Azul +01,Aguascalientes,001,Aguascalientes,0100,Rancho Alegre +01,Aguascalientes,001,Aguascalientes,0102,Los Arbolitos [Rancho] +01,Aguascalientes,001,Aguascalientes,0104,Ardillas de Abajo (Las Ardillas) +01,Aguascalientes,001,Aguascalientes,0106,Arellano +01,Aguascalientes,001,Aguascalientes,0112,Bajío los Vázquez +01,Aguascalientes,001,Aguascalientes,0113,Bajío de Montoro +01,Aguascalientes,001,Aguascalientes,0114,Residencial San Nicolás [Baños la Cantera] +01,Aguascalientes,001,Aguascalientes,0120,Buenavista de Peñuelas +01,Aguascalientes,001,Aguascalientes,0121,Cabecita 3 Marías (Rancho Nuevo) +01,Aguascalientes,001,Aguascalientes,0125,Cañada Grande de Cotorina +01,Aguascalientes,001,Aguascalientes,0126,Cañada Honda [Estación] +01,Aguascalientes,001,Aguascalientes,0127,Los Caños +01,Aguascalientes,001,Aguascalientes,0128,El Cariñán +01,Aguascalientes,001,Aguascalientes,0129,El Carmen [Granja] +01,Aguascalientes,001,Aguascalientes,0135,El Cedazo (Cedazo de San Antonio) +01,Aguascalientes,001,Aguascalientes,0138,Centro de Arriba (El Taray) +01,Aguascalientes,001,Aguascalientes,0139,Cieneguilla (La Lumbrera) +01,Aguascalientes,001,Aguascalientes,0141,Cobos +01,Aguascalientes,001,Aguascalientes,0144,El Colorado (El Soyatal) +01,Aguascalientes,001,Aguascalientes,0146,El Conejal +01,Aguascalientes,001,Aguascalientes,0157,Cotorina de Abajo +01,Aguascalientes,001,Aguascalientes,0162,Coyotes +01,Aguascalientes,001,Aguascalientes,0166,La Huerta (La Cruz) +01,Aguascalientes,001,Aguascalientes,0170,Cuauhtémoc 
(Las Palomas) +01,Aguascalientes,001,Aguascalientes,0171,Los Cuervos (Los Ojos de Agua) +01,Aguascalientes,001,Aguascalientes,0172,San José [Granja] +01,Aguascalientes,001,Aguascalientes,0176,La Chiripa +01,Aguascalientes,001,Aguascalientes,0182,Dolores +01,Aguascalientes,001,Aguascalientes,0183,Los Dolores +01,Aguascalientes,001,Aguascalientes,0190,El Duraznillo +01,Aguascalientes,001,Aguascalientes,0191,Los Durón +01,Aguascalientes,001,Aguascalientes,0197,La Escondida +01,Aguascalientes,001,Aguascalientes,0201,Brande Vin [Bodegas] +01,Aguascalientes,001,Aguascalientes,0207,Valle Redondo +01,Aguascalientes,001,Aguascalientes,0209,La Fortuna +01,Aguascalientes,001,Aguascalientes,0212,Lomas del Gachupín +01,Aguascalientes,001,Aguascalientes,0213,El Carmen (Gallinas Güeras) [Rancho] +01,Aguascalientes,001,Aguascalientes,0216,La Gloria +01,Aguascalientes,001,Aguascalientes,0226,Hacienda Nueva diff --git a/docs/benchmark/stream/mexican_towns/mexican_towns.pdf b/docs/benchmark/stream/mexican_towns/mexican_towns.pdf new file mode 100755 index 0000000..46cd236 Binary files /dev/null and b/docs/benchmark/stream/mexican_towns/mexican_towns.pdf differ diff --git a/docs/benchmark/stream/missing_values/missing_values-data-camelot-page-1-table-1.csv b/docs/benchmark/stream/missing_values/missing_values-data-camelot-page-1-table-1.csv new file mode 100755 index 0000000..18e75d9 --- /dev/null +++ b/docs/benchmark/stream/missing_values/missing_values-data-camelot-page-1-table-1.csv @@ -0,0 +1,44 @@ +"Bhandara - Key Indicators","","","","" +"","DLHS-4 (2012-13)","","DLHS-3 (2007-08)","" +"Indicators","TOTAL","RURAL","TOTAL","RURAL" +"Reported Prevalence of Morbidity","","","","" +"Any Injury .....................................................................................................................................","1.9","2.1","","" +"Acute Illness 
.................................................................................................................................","4.5","5.6","","" +"Chronic Illness ..............................................................................................................................","5.1","4.1","","" +"Reported Prevalence of Chronic Illness during last one year (%)","","","","" +"Disease of respiratory system ......................................................................................................","11.7","15.0","","" +"Disease of cardiovascular system ................................................................................................","8.9","9.3","","" +"Persons suffering from tuberculosis .............................................................................................","2.2","1.5","","" +"Anaemia Status by Haemoglobin Level14 (%)","","","","" +"Children (6-59 months) having anaemia ......................................................................................","68.5","71.9","","" +"Children (6-59 months) having severe anaemia ..........................................................................","6.7","9.4","","" +"Children (6-9 Years) having anaemia - Male ................................................................................","67.1","71.4","","" +"Children (6-9 Years) having severe anaemia - Male ....................................................................","4.4","2.4","","" +"Children (6-9 Years) having anaemia - Female ...........................................................................","52.4","48.8","","" +"Children (6-9 Years) having severe anaemia - Female ................................................................","1.2","0.0","","" +"Children (6-14 years) having anaemia - Male .............................................................................","50.8","62.5","","" +"Children (6-14 years) having severe anaemia - Male 
..................................................................","3.7","3.6","","" +"Children (6-14 years) having anaemia - Female .........................................................................","48.3","50.0","","" +"Children (6-14 years) having severe anaemia - Female ..............................................................","4.3","6.1","","" +"Children (10-19 Years15) having anaemia - Male .........................................................................","37.9","51.2","","" +"Children (10-19 Years15) having severe anaemia - Male .............................................................","3.5","4.0","","" +"Children (10-19 Years15) having anaemia - Female .....................................................................","46.6","52.1","","" +"Children (10-19 Years15) having severe anaemia - Female .........................................................","6.4","6.5","","" +"Adolescents (15-19 years) having anaemia ................................................................................","39.4","46.5","","" +"Adolescents (15-19 years) having severe anaemia .....................................................................","5.4","5.1","","" +"Pregnant women (15-49 aged) having anaemia ..........................................................................","48.8","51.5","","" +"Pregnant women (15-49 aged) having severe anaemia ..............................................................","7.1","8.8","","" +"Women (15-49 aged) having anaemia .........................................................................................","45.2","51.7","","" +"Women (15-49 aged) having severe anaemia .............................................................................","4.8","5.9","","" +"Persons (20 years and above) having anaemia ...........................................................................","37.8","42.1","","" +"Persons (20 years and above) having Severe anaemia 
..............................................................","4.6","4.8","","" +"Blood Sugar Level (age 18 years and above) (%)","","","","" +"Blood Sugar Level >140 mg/dl (high) ...........................................................................................","12.9","11.1","","" +"Blood Sugar Level >160 mg/dl (very high) ...................................................................................","7.0","5.1","","" +"Hypertension (age 18 years and above) (%)","","","","" +"Above Normal Range (Systolic >140 mm of Hg & Diastolic >90 mm of Hg ) ..............................","23.8","22.8","","" +"Moderately High (Systolic >160 mm of Hg & Diastolic >100 mm of Hg ) .....................................","8.2","7.1","","" +"Very High (Systolic >180 mm of Hg & Diastolic >110 mm of Hg ) ...............................................","3.7","3.1","","" +"14 Any anaemia below 11g/dl, severe anaemia below 7g/dl. 15 Excluding age group 19 years","","","","" +"Chronic Illness :Any person with symptoms persisting for longer than one month is defined as suffering from chronic illness","","","","" +"4","","","","" diff --git a/docs/benchmark/stream/missing_values/missing_values-data-tabula.csv b/docs/benchmark/stream/missing_values/missing_values-data-tabula.csv new file mode 100755 index 0000000..826209d --- /dev/null +++ b/docs/benchmark/stream/missing_values/missing_values-data-tabula.csv @@ -0,0 +1,42 @@ +"",DLHS-4 (2012-13),DLHS-3 (2007-08) +Indicators,TOTAL RURAL,TOTAL RURAL +Reported Prevalence of Morbidity,, +Any Injury .....................................................................................................................................,1.9 2.1, +Acute Illness .................................................................................................................................,4.5 5.6, +Chronic Illness 
..............................................................................................................................,5.1 4.1, +Reported Prevalence of Chronic Illness during last one year (%),, +Disease of respiratory system ......................................................................................................,11.7 15.0, +Disease of cardiovascular system ................................................................................................,8.9 9.3, +Persons suffering from tuberculosis .............................................................................................,2.2 1.5, +Anaemia Status by Haemoglobin Level14 (%),, +Children (6-59 months) having anaemia ......................................................................................,68.5 71.9, +Children (6-59 months) having severe anaemia ..........................................................................,6.7 9.4, +Children (6-9 Years) having anaemia - Male ................................................................................,67.1 71.4, +Children (6-9 Years) having severe anaemia - Male ....................................................................,4.4 2.4, +Children (6-9 Years) having anaemia - Female ...........................................................................,52.4 48.8, +Children (6-9 Years) having severe anaemia - Female ................................................................,1.2 0.0, +Children (6-14 years) having anaemia - Male .............................................................................,50.8 62.5, +Children (6-14 years) having severe anaemia - Male ..................................................................,3.7 3.6, +Children (6-14 years) having anaemia - Female .........................................................................,48.3 50.0, +Children (6-14 years) having severe anaemia - Female ..............................................................,4.3 6.1, +Children 
(10-19 Years15) having anaemia - Male .........................................................................,37.9 51.2, +Children (10-19 Years15) having severe anaemia - Male .............................................................,3.5 4.0, +Children (10-19 Years15) having anaemia - Female .....................................................................,46.6 52.1, +Children (10-19 Years15) having severe anaemia - Female .........................................................,6.4 6.5, +Adolescents (15-19 years) having anaemia ................................................................................,39.4 46.5, +Adolescents (15-19 years) having severe anaemia .....................................................................,5.4 5.1, +Pregnant women (15-49 aged) having anaemia ..........................................................................,48.8 51.5, +Pregnant women (15-49 aged) having severe anaemia ..............................................................,7.1 8.8, +Women (15-49 aged) having anaemia .........................................................................................,45.2 51.7, +Women (15-49 aged) having severe anaemia .............................................................................,4.8 5.9, +Persons (20 years and above) having anaemia ...........................................................................,37.8 42.1, +Persons (20 years and above) having Severe anaemia ..............................................................,4.6 4.8, +Blood Sugar Level (age 18 years and above) (%),, +Blood Sugar Level >140 mg/dl (high) ...........................................................................................,12.9 11.1, +Blood Sugar Level >160 mg/dl (very high) ...................................................................................,7.0 5.1, +Hypertension (age 18 years and above) (%),, +Above Normal Range (Systolic >140 mm of Hg & Diastolic >90 mm of Hg ) 
..............................,23.8 22.8, +Moderately High (Systolic >160 mm of Hg & Diastolic >100 mm of Hg ) .....................................,8.2 7.1, +Very High (Systolic >180 mm of Hg & Diastolic >110 mm of Hg ) ...............................................,3.7 3.1, +"14 Any anaemia below 11g/dl, severe anaemia below 7g/dl. 15 Excluding age group 19 years",, +Chronic Illness :Any person with symptoms persisting for longer than one month is defined as suffering from chronic illness,, diff --git a/docs/benchmark/stream/missing_values/missing_values.pdf b/docs/benchmark/stream/missing_values/missing_values.pdf new file mode 100755 index 0000000..90b620f Binary files /dev/null and b/docs/benchmark/stream/missing_values/missing_values.pdf differ diff --git a/docs/benchmark/stream/population_growth/population_growth-data-camelot-page-1-table-1.csv b/docs/benchmark/stream/population_growth/population_growth-data-camelot-page-1-table-1.csv new file mode 100755 index 0000000..b642e18 --- /dev/null +++ b/docs/benchmark/stream/population_growth/population_growth-data-camelot-page-1-table-1.csv @@ -0,0 +1,42 @@ +"","TABLE 4: STATE-WISE DECADAL GROWTH RATE OF POPULATION","","","","","" +"","","","","","","(Per cent)" +"States/Union Territories","1951-1961","1961-1971","1971-1981","1981-1991","1991-2001","2001-2011" +"Andaman and Nicobar Islands","105.19","81.17","63.93","48.70","26.90","6.68" +"Andhra Pradesh","15.65","20.90","23.10","24.20","14.59","11.10" +"Arunachal Pradesh","-","38.91","35.15","36.83","27.00","25.92" +"Assam","34.98","34.95","23.36","24.24","18.92","16.93" +"Bihar","19.79","20.91","24.16","23.38","28.62","25.07" +"Chandigarh","394.13","114.59","75.55","42.16","40.28","17.10" +"Chhattisgarh","22.77","27.12","20.39","25.73","18.27","22.59" +"Dadra and Nagar Haveli","39.56","27.96","39.78","33.57","59.22","55.50" +"Daman and Diu","-24.56","70.85","26.07","28.62","55.73","53.54" +"NCT of Delhi","52.44","52.93","53.00","51.45","47.02","20.96" 
+"Goa","7.77","34.77","26.74","16.08","15.21","8.17" +"Gujarat","26.88","29.39","27.67","21.19","22.66","19.17" +"Haryana","33.79","32.22","28.75","27.41","28.43","19.90" +"Himachal Pradesh","17.87","23.04","23.71","20.79","17.54","12.81" +"Jammu and Kashmir","9.44","29.65","29.69","30.89","29.43","23.71" +"Jharkhand","19.69","22.58","23.79","24.03","23.36","22.34" +"Karnataka","21.57","24.22","26.75","21.12","17.51","15.67" +"Kerala","24.76","26.29","19.24","14.32","9.43","4.86" +"Lakshadweep","14.61","31.95","26.53","28.47","17.30","6.23" +"Madhya Pradesh","24.73","29.28","27.16","27.24","24.26","20.30" +"Maharashtra","23.60","27.45","24.54","25.73","22.73","15.99" +"Manipur","35.04","37.53","32.46","29.29","24.86","18.65" +"Meghalaya","27.03","31.50","32.04","32.86","30.65","27.82" +"Mizoram","35.61","24.93","48.55","39.70","28.82","22.78" +"Nagaland","14.07","39.88","50.05","56.08","64.53","-0.47" +"Odisha","19.82","25.05","20.17","20.06","16.25","13.97" +"Puducherry","16.34","27.81","28.15","33.64","20.62","27.72" +"Punjab","21.56","21.70","23.89","20.81","20.10","13.73" +"Rajasthan","26.20","27.83","32.97","28.44","28.41","21.44" +"Sikkim","17.76","29.38","50.77","28.47","33.06","12.36" +"Tamil Nadu","11.85","22.30","17.50","15.39","11.72","15.60" +"Tripura","78.71","36.28","31.92","34.30","16.03","14.75" +"Uttar Pradesh","16.38","19.54","25.39","25.61","25.85","20.09" +"Uttarakhand","22.57","24.42","27.45","23.13","20.41","19.17" +"West Bengal","32.80","26.87","23.17","24.73","17.77","13.93" +"ALL INDIA","21.51","24.80","24.66","23.87","21.54","17.64" +"‘-’: Not Available.","","","","","","" +"Source : Registrar General of India, Ministry of Home Affairs, Government of India.","","","","","","" +"","","6","","","","" diff --git a/docs/benchmark/stream/population_growth/population_growth-data-tabula.csv b/docs/benchmark/stream/population_growth/population_growth-data-tabula.csv new file mode 100755 index 0000000..defbfad --- /dev/null +++ 
b/docs/benchmark/stream/population_growth/population_growth-data-tabula.csv @@ -0,0 +1,37 @@ +States/Union Territories,1951-1961,1961-1971,1971-1981,1981-1991,1991-2001,2001-2011 +Andaman and Nicobar Islands,105.19,81.17,63.93,48.70,26.90,6.68 +Andhra Pradesh,15.65,20.90,23.10,24.20,14.59,11.10 +Arunachal Pradesh,-,38.91,35.15,36.83,27.00,25.92 +Assam,34.98,34.95,23.36,24.24,18.92,16.93 +Bihar,19.79,20.91,24.16,23.38,28.62,25.07 +Chandigarh,394.13,114.59,75.55,42.16,40.28,17.10 +Chhattisgarh,22.77,27.12,20.39,25.73,18.27,22.59 +Dadra and Nagar Haveli,39.56,27.96,39.78,33.57,59.22,55.50 +Daman and Diu,-24.56,70.85,26.07,28.62,55.73,53.54 +NCT of Delhi,52.44,52.93,53.00,51.45,47.02,20.96 +Goa,7.77,34.77,26.74,16.08,15.21,8.17 +Gujarat,26.88,29.39,27.67,21.19,22.66,19.17 +Haryana,33.79,32.22,28.75,27.41,28.43,19.90 +Himachal Pradesh,17.87,23.04,23.71,20.79,17.54,12.81 +Jammu and Kashmir,9.44,29.65,29.69,30.89,29.43,23.71 +Jharkhand,19.69,22.58,23.79,24.03,23.36,22.34 +Karnataka,21.57,24.22,26.75,21.12,17.51,15.67 +Kerala,24.76,26.29,19.24,14.32,9.43,4.86 +Lakshadweep,14.61,31.95,26.53,28.47,17.30,6.23 +Madhya Pradesh,24.73,29.28,27.16,27.24,24.26,20.30 +Maharashtra,23.60,27.45,24.54,25.73,22.73,15.99 +Manipur,35.04,37.53,32.46,29.29,24.86,18.65 +Meghalaya,27.03,31.50,32.04,32.86,30.65,27.82 +Mizoram,35.61,24.93,48.55,39.70,28.82,22.78 +Nagaland,14.07,39.88,50.05,56.08,64.53,-0.47 +Odisha,19.82,25.05,20.17,20.06,16.25,13.97 +Puducherry,16.34,27.81,28.15,33.64,20.62,27.72 +Punjab,21.56,21.70,23.89,20.81,20.10,13.73 +Rajasthan,26.20,27.83,32.97,28.44,28.41,21.44 +Sikkim,17.76,29.38,50.77,28.47,33.06,12.36 +Tamil Nadu,11.85,22.30,17.50,15.39,11.72,15.60 +Tripura,78.71,36.28,31.92,34.30,16.03,14.75 +Uttar Pradesh,16.38,19.54,25.39,25.61,25.85,20.09 +Uttarakhand,22.57,24.42,27.45,23.13,20.41,19.17 +West Bengal,32.80,26.87,23.17,24.73,17.77,13.93 +ALL INDIA,21.51,24.80,24.66,23.87,21.54,17.64 diff --git a/docs/benchmark/stream/population_growth/population_growth.pdf 
b/docs/benchmark/stream/population_growth/population_growth.pdf new file mode 100755 index 0000000..41a166e Binary files /dev/null and b/docs/benchmark/stream/population_growth/population_growth.pdf differ diff --git a/docs/benchmark/stream/superscript/superscript-data-camelot-page-1-table-1.csv b/docs/benchmark/stream/superscript/superscript-data-camelot-page-1-table-1.csv new file mode 100755 index 0000000..1bd4857 --- /dev/null +++ b/docs/benchmark/stream/superscript/superscript-data-camelot-page-1-table-1.csv @@ -0,0 +1,41 @@ +"","TABLE 125: STATE-WISE COMPOSITION OF OUTSTANDING LIABILITIES - 1997 (Contd.)","","","","","","","","","" +"","","","","(As at end-March)","","","","","","" +"","","","","","","","","","","(` Billion)" +"States","Total","Market","NSSF","WMA","Loans","Loans","Loans","Loans","Loans","Loans" +"","Internal","Loans","","from","from","from","from","from","from SBI","from" +"","Debt","","","RBI","Banks","LIC","GIC","NABARD","& Other","NCDC" +"","","","","","& FIs","","","","Banks","" +"1","2=","3","4","5","6=","7","8","9","10","11" +"","(3 to 6)+14","","","","(7 to13)","","","","","" +"Andhra Pradesh","48.11","40.45","-","3.26","4.4","2.62","-","0.91","-","0.25" +"Arunachal Pradesh","1.23","1.1","-","-","0.13","-","-","-","-","-" +"Assam","12.69","10.02","-","2.41","0.26","0.08","-","-0.06","0.01","0.24" +"Bihar","40.75","41.54","-","-","-1.42","0.19","-","-1.01","-0.36","0.2" +"Chhattisgarh","-","-","-","-","-","-","-","-","-","-" +"Goa","1.4","1.02","-","-","0.38","0.31","-","0.07","-","-" +"Gujarat","19.75","17.1","-","-","2.64","1.17","-","1.11","-","0.44" +"Haryana","11.53","9.67","-","0.06","1.8","0.55","-","0.64","-","0.49" +"Himachal Pradesh","8.02","2.94","-","4.55","0.53","0.13","-","0.05","-","0.25" +"Jammu and Kashmir","11.72","4.49","-","-","7.23","0.66","-","0.02","6.08","-" +"Jharkhand","-","-","-","-","-","-","-","-","-","-" +"Karnataka","22.44","19.59","-","-","2.86","1.22","-","0.89","-","0.69" 
+"Kerala","29.03","24.912","-","-","4.11","1.77","-","0.48","-","1.45" +"Madhya Pradesh","27.13","23.57","-","-","3.56","0.38","-","1.86","-","1.28" +"Maharashtra","30.47","26.07","-","-","4.39","0.21","-","-0.12","0.02","2.89" +"Manipur","2.17","1.61","-","0.26","0.29","0.08","-","-","-","0.09" +"Meghalaya","1.36","1.38","-","-","-0.02","0.04","-","-0.05","-","0.03" +"Mizoram","1.17","0.46","-","0.27","0.43","0.11","-","-","-","0.03" +"Nagaland","2.99","2.6","-","-","0.39","0.24","-","-","-","0.04" +"Odisha","34.04","27.58","-","4.4","2.06","0.56","-","0.66","-","0.2" +"Punjab","19.18","10.93","-","1.03","7.23","0.17","-","0.71","5.9","0.46" +"Rajasthan","36.77","28.63","-","4.99","3.16","0.57","-","1.64","-","0.81" +"Sikkim","0.16","-","-","-","0.16","0.03","-","-","-","0.01" +"Tamil Nadu","34.11","31.41","-","-","2.7","1.3","-","0.6","-","0.68" +"Tripura","2.3","1.89","-","-","0.41","0.41","-","-0.05","-","0.02" +"Uttaranchal","-","-","-","-","-","-","-","-","-","-" +"Uttar Pradesh","80.62","74.89","-","4.34","1.34","0.6","-","-0.21","0.18","0.03" +"West Bengal","34.23","32.19","-","-","2.04","0.77","-","0.06","-","0.51" +"NCT Delhi","-","-","-","-","-","-","-","-","-","-" +"ALL STATES","513.38","436.02","-","25.57","51.06","14.18","-","8.21","11.83","11.08" +"2 Includes `2.45 crore outstanding under “Market Loan Suspense”.","","","","","","","","","","" +"","","","","445","","","","","","" diff --git a/docs/benchmark/stream/superscript/superscript-data-tabula.csv b/docs/benchmark/stream/superscript/superscript-data-tabula.csv new file mode 100755 index 0000000..8650656 --- /dev/null +++ b/docs/benchmark/stream/superscript/superscript-data-tabula.csv @@ -0,0 +1,36 @@ +States,Total,Market,NSSF,WMA,Loans,Loans,Loans,Loans,Loans,Loans +"",Internal,Loans,,from,from,from,from,from,from SBI,from +"",Debt,,,RBI,Banks,LIC,GIC,NABARD,& Other,NCDC +"",,,,,& FIs,,,,Banks, +1,2=,3,4,5,6=,7,8,9,10,11 +"",(3 to 6)+14,,,,(7 to13),,,,, +Andhra 
Pradesh,48.11,40.45,-,3.26,4.4,2.62,-,0.91,-,0.25 +Arunachal Pradesh,1.23,1.1,-,-,0.13,-,-,-,-,- +Assam,12.69,10.02,-,2.41,0.26,0.08,-,-0.06,0.01,0.24 +Bihar,40.75,41.54,-,-,-1.42,0.19,-,-1.01,-0.36,0.2 +Chhattisgarh,-,-,-,-,-,-,-,-,-,- +Goa,1.4,1.02,-,-,0.38,0.31,-,0.07,-,- +Gujarat,19.75,17.1,-,-,2.64,1.17,-,1.11,-,0.44 +Haryana,11.53,9.67,-,0.06,1.8,0.55,-,0.64,-,0.49 +Himachal Pradesh,8.02,2.94,-,4.55,0.53,0.13,-,0.05,-,0.25 +Jammu and Kashmir,11.72,4.49,-,-,7.23,0.66,-,0.02,6.08,- +Jharkhand,-,-,-,-,-,-,-,-,-,- +Karnataka,22.44,19.59,-,-,2.86,1.22,-,0.89,-,0.69 +Kerala,29.03,24.912,-,-,4.11,1.77,-,0.48,-,1.45 +Madhya Pradesh,27.13,23.57,-,-,3.56,0.38,-,1.86,-,1.28 +Maharashtra,30.47,26.07,-,-,4.39,0.21,-,-0.12,0.02,2.89 +Manipur,2.17,1.61,-,0.26,0.29,0.08,-,-,-,0.09 +Meghalaya,1.36,1.38,-,-,-0.02,0.04,-,-0.05,-,0.03 +Mizoram,1.17,0.46,-,0.27,0.43,0.11,-,-,-,0.03 +Nagaland,2.99,2.6,-,-,0.39,0.24,-,-,-,0.04 +Odisha,34.04,27.58,-,4.4,2.06,0.56,-,0.66,-,0.2 +Punjab,19.18,10.93,-,1.03,7.23,0.17,-,0.71,5.9,0.46 +Rajasthan,36.77,28.63,-,4.99,3.16,0.57,-,1.64,-,0.81 +Sikkim,0.16,-,-,-,0.16,0.03,-,-,-,0.01 +Tamil Nadu,34.11,31.41,-,-,2.7,1.3,-,0.6,-,0.68 +Tripura,2.3,1.89,-,-,0.41,0.41,-,-0.05,-,0.02 +Uttaranchal,-,-,-,-,-,-,-,-,-,- +Uttar Pradesh,80.62,74.89,-,4.34,1.34,0.6,-,-0.21,0.18,0.03 +West Bengal,34.23,32.19,-,-,2.04,0.77,-,0.06,-,0.51 +NCT Delhi,-,-,-,-,-,-,-,-,-,- +ALL STATES,513.38,436.02,-,25.57,51.06,14.18,-,8.21,11.83,11.08 diff --git a/docs/benchmark/stream/superscript/superscript.pdf b/docs/benchmark/stream/superscript/superscript.pdf new file mode 100755 index 0000000..855a3bd Binary files /dev/null and b/docs/benchmark/stream/superscript/superscript.pdf differ diff --git a/docs/benchmark/stream/us-007/us-007-data-camelot-page-1-table-1.csv b/docs/benchmark/stream/us-007/us-007-data-camelot-page-1-table-1.csv new file mode 100755 index 0000000..13b1dbc --- /dev/null +++ b/docs/benchmark/stream/us-007/us-007-data-camelot-page-1-table-1.csv @@ -0,0 
+1,11 @@ +"","One Withholding" +"Payroll Period","Allowance" +"Weekly","$71.15" +"Biweekly","142.31" +"Semimonthly","154.17" +"Monthly","308.33" +"Quarterly","925.00" +"Semiannually","1,850.00" +"Annually","3,700.00" +"Daily or Miscellaneous","14.23" +"(each day of the payroll period)","" diff --git a/docs/benchmark/stream/us-007/us-007-data-tabula.csv b/docs/benchmark/stream/us-007/us-007-data-tabula.csv new file mode 100755 index 0000000..d2d3cc5 --- /dev/null +++ b/docs/benchmark/stream/us-007/us-007-data-tabula.csv @@ -0,0 +1,11 @@ +"",One Withholding +Payroll Period,Allowance +Weekly,$ 71.15 +Biweekly,142.31 +Semimonthly,154.17 +Monthly,308.33 +Quarterly,925.00 +Semiannually,"1,850.00" +Annually,"3,700.00" +Daily or Miscellaneous,14.23 +"(each day of the payroll period)", diff --git a/docs/benchmark/stream/us-007/us-007.pdf b/docs/benchmark/stream/us-007/us-007.pdf new file mode 100755 index 0000000..45b3de3 Binary files /dev/null and b/docs/benchmark/stream/us-007/us-007.pdf differ diff --git a/docs/conf.py b/docs/conf.py index 901900a..ff195c8 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -12,12 +12,21 @@ # All configuration values have a default; values that are commented out # serve to show the default. +import os +import sys + # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. -import os -import sys +# +# sys.path.insert(0, os.path.abspath('..')) + +# Insert Camelot's path into the system. sys.path.insert(0, os.path.abspath('..')) +sys.path.insert(0, os.path.abspath('_themes')) + +import camelot + # -- General configuration ------------------------------------------------ @@ -53,15 +62,14 @@ source_suffix = '.rst' master_doc = 'index' # General information about the project. 
-project = u'camelot' -copyright = u'2016, SocialCops' +project = u'Camelot' +copyright = u'2018, Peeply Private Ltd (Singapore)' author = u'Vinayak Mehta' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. -import camelot # The short X.Y version. version = camelot.__version__ # The full version, including alpha/beta/rc tags. @@ -86,7 +94,7 @@ language = None # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This patterns also effect to html_static_path and html_extra_path -exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] +exclude_patterns = ['_build'] # The reST default role (used for this markup: `text`) to use for all # documents. @@ -94,13 +102,11 @@ exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] # default_role = None # If true, '()' will be appended to :func: etc. cross-reference text. -# -# add_function_parentheses = True +add_function_parentheses = True # If true, the current module name will be prepended to all description # unit titles (such as .. function::). -# -# add_module_names = True +add_module_names = True # If true, sectionauthor and moduleauthor directives will be shown in the # output. They are ignored by default. @@ -108,7 +114,7 @@ exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] # show_authors = False # The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' +pygments_style = 'flask_theme_support.FlaskyStyle' # A list of ignored prefixes for module index sorting. # modindex_common_prefix = [] @@ -124,7 +130,6 @@ todo_include_todos = True # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. 
-# html_theme = 'alabaster' # Theme options are theme-specific and customize the look and feel of a theme @@ -135,7 +140,8 @@ html_theme_options = { 'github_user': 'socialcopsdev', 'github_repo': 'camelot', 'github_banner': True, - 'show_related': False + 'show_related': False, + 'note_bg': '#FFF59C' } # Add any paths that contain custom themes here, relative to this directory. @@ -144,7 +150,7 @@ html_theme_options = { # The name for this set of Sphinx documents. # " v documentation" by default. # -# html_title = u'camelot v0.1' +# html_title = None # A shorter title for the navigation bar. Default is the same as html_title. # @@ -153,13 +159,12 @@ html_theme_options = { # The name of an image file (relative to this directory) to place at the top # of the sidebar. # -html_logo = 'assets/camelot.png' +# html_logo = None # The name of an image file (relative to this directory) to use as a favicon of # the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 # pixels large. -# -html_favicon = 'assets/favicon.ico' +html_favicon = '_static/favicon.ico' # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, @@ -180,12 +185,15 @@ html_static_path = ['_static'] # If true, SmartyPants will be used to convert quotes and dashes to # typographically correct entities. -# -# html_use_smartypants = True +html_use_smartypants = True # Custom sidebar templates, maps document names to template names. -# -# html_sidebars = {} +html_sidebars = { + 'index': ['sidebarintro.html', 'relations.html', 'sourcelink.html', + 'searchbox.html', 'hacks.html'], + '**': ['sidebarlogo.html', 'localtoc.html', 'relations.html', + 'sourcelink.html', 'searchbox.html', 'hacks.html'] +} # Additional templates that should be rendered to pages, maps page names to # template names. 
@@ -205,16 +213,13 @@ html_static_path = ['_static'] # html_split_index = False # If true, links to the reST sources are added to the pages. -# -# html_show_sourcelink = True +html_show_sourcelink = False # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. -# -# html_show_sphinx = True +html_show_sphinx = False # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. -# -# html_show_copyright = True +html_show_copyright = True # If true, an OpenSearch description file will be output, and all pages will # contain a tag referring to it. The value of this option must be the @@ -244,7 +249,7 @@ html_static_path = ['_static'] # html_search_scorer = 'scorer.js' # Output file base name for HTML help builder. -htmlhelp_basename = 'camelotdoc' +htmlhelp_basename = 'Camelotdoc' # -- Options for LaTeX output --------------------------------------------- @@ -270,7 +275,7 @@ latex_elements = { # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - (master_doc, 'camelot.tex', u'camelot Documentation', + (master_doc, 'Camelot.tex', u'Camelot Documentation', u'Vinayak Mehta', 'manual'), ] @@ -312,7 +317,7 @@ latex_documents = [ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ - (master_doc, 'camelot', u'camelot Documentation', + (master_doc, 'Camelot', u'Camelot Documentation', [author], 1) ] @@ -327,8 +332,8 @@ man_pages = [ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - (master_doc, 'camelot', u'camelot Documentation', - author, 'camelot', 'One line description of project.', + (master_doc, 'Camelot', u'Camelot Documentation', + author, 'Camelot', 'One line description of project.', 'Miscellaneous'), ] @@ -350,4 +355,7 @@ texinfo_documents = [ # Example configuration for intersphinx: refer to the Python standard library. 
-intersphinx_mapping = {'https://docs.python.org/2': None} +intersphinx_mapping = { + 'https://docs.python.org/2': None, + 'http://pandas.pydata.org/pandas-docs/stable': None +} \ No newline at end of file diff --git a/docs/contributing.rst b/docs/contributing.rst deleted file mode 100644 index 928dd33..0000000 --- a/docs/contributing.rst +++ /dev/null @@ -1,25 +0,0 @@ -.. _contributing: - -The preferred way to contribute to Camelot is to fork this repository, and then submit a "pull request" (PR): - -1. Create an account on GitHub if you don't already have one. - -2. Fork the project repository: click on the ‘Fork’ button near the top of the page. This creates a copy of the code under your account on the GitHub server. - -3. Clone this copy to your local disk. -4. Create a branch to hold your changes:: - - git checkout -b my-feature - - and start making changes. Never work in the `master` branch! - -5. Work on this copy, on your computer, using Git to do the version control. When you’re done editing, do:: - - $ git add modified_files - $ git commit - - to record your changes in Git, then push them to GitHub with:: - - $ git push -u origin my-feature - -Finally, go to the web page of the your fork of the camelot repo, and click ‘Pull request’ to send your changes to the maintainers for review. diff --git a/docs/dev/contributing.rst b/docs/dev/contributing.rst new file mode 100644 index 0000000..cc4815f --- /dev/null +++ b/docs/dev/contributing.rst @@ -0,0 +1,159 @@ +.. _contributing: + +Contributor's Guide +=================== + +If you're reading this, you're probably looking to contribute to Camelot. *Time is the only real currency*, and the fact that you're considering spending some here is *very* generous of you. Thank you very much! + +This document will help you get started with contributing documentation, code, testing and filing issues. If you have any questions, feel free to reach out to `Vinayak Mehta`_, the author and maintainer. + +.. 
_Vinayak Mehta: http://vinayak-mehta.github.io + +Code Of Conduct +--------------- + +The following quote sums up the **Code Of Conduct**. + + **Be cordial or be on your way**. *--Kenneth Reitz* + +Kenneth Reitz has also written an `essay`_ on this topic, which you should read. + +.. _essay: https://www.kennethreitz.org/essays/be-cordial-or-be-on-your-way + +As the `Requests Code Of Conduct`_ states, **all contributions are welcome**, as long as everyone involved is treated with respect. + +.. _Requests Code Of Conduct: http://docs.python-requests.org/en/master/dev/contributing/#be-cordial + +Your First Contribution +----------------------- + +A great way to start contributing to Camelot is to pick an issue tagged with the `Contributor Friendly`_ or the `Easy`_ tags. If you're unable to find a good first issue, feel free to contact the maintainer. + +.. _Contributor Friendly: https://github.com/socialcopsdev/camelot/labels/Contributor%20Friendly +.. _Easy: https://github.com/socialcopsdev/camelot/labels/Level%3A%20Easy + +Setting up a development environment +------------------------------------ + +To install the dependencies needed for development, you can use pip:: + + $ pip install camelot-py[dev] + +Pull Requests +------------- + +Submit a Pull Request +^^^^^^^^^^^^^^^^^^^^^ + +The preferred workflow for contributing to Camelot is to fork the `project repository`_ on GitHub, clone, develop on a branch and then finally submit a pull request. Steps: + +.. _project repository: https://github.com/socialcopsdev/camelot + +1. Fork the project repository: click on the ‘Fork’ button near the top of the page. This creates a copy of the code under your account on GitHub. + +2. Clone your fork of Camelot from your GitHub account:: + + $ git clone https://www.github.com/[username]/camelot + +3. Create a branch to hold your changes:: + + $ git checkout -b my-feature + +Always branch out from ``master`` to work on your contribution. 
It's good practice to never work on the ``master`` branch! + +.. note:: Protip: ``git stash`` is a great way to save the work that you haven't committed yet, to move between branches. + +4. Work on your contribution. Add changed files using ``git add`` and then ``git commit`` them:: + + $ git add modified_files + $ git commit + +5. Finally, push them to your GitHub fork:: + + $ git push -u origin my-feature + +Now it's time to go to your fork of Camelot and create a pull request! You can `follow these instructions`_ to do the same. + +.. _follow these instructions: https://help.github.com/articles/creating-a-pull-request-from-a-fork/ + +Work on your Pull Request +^^^^^^^^^^^^^^^^^^^^^^^^^ + +We recommend that your pull request complies with the following guidelines: + +- Make sure your code follows `pep8`_. + +.. _pep8: http://pep8.org + +- In case your pull request contains function docstrings, make sure you follow the `numpydoc`_ format. All function docstrings in Camelot follow this format. Following the format will make sure that the API documentation is generated flawlessly. + +.. _numpydoc: https://numpydoc.readthedocs.io/en/latest/format.html + +- Make sure your commit messages follow `the seven rules of a great git commit message`_. + - Separate subject from body with a blank line + - Limit the subject line to 50 characters + - Capitalize the subject line + - Do not end the subject line with a period + - Use the imperative mood in the subject line + - Wrap the body at 72 characters + - Use the body to explain what and why vs. how + +.. _the seven rules of a great git commit message: https://chris.beams.io/posts/git-commit/ + +- Please prefix the title of your pull request with [MRG] (Ready for Merge), if the contribution is complete and ready for a detailed review. An incomplete pull request's title should be prefixed with [WIP] (to indicate a work in progress), and changed to [MRG] when it's complete. 
A good `task list`_ in the PR description will ensure that other people get a fair idea of what it proposes to do, which will also increase collaboration. + +.. _task list: https://blog.github.com/2013-01-09-task-lists-in-gfm-issues-pulls-comments/ + +- If contributing new functionality, make sure that you add a unit test for it, while making sure that all previous tests pass. Camelot uses `pytest`_ for testing. Tests can be run using: + +.. _pytest: https://docs.pytest.org/en/latest/ + +:: + + $ python setup.py test + +Writing Documentation +--------------------- + +Writing documentation, function docstrings, examples and tutorials is a great way to start contributing to open-source software! The documentation is present inside the ``docs/`` directory of the source code repository. + +The documentation is written in `reStructuredText`_, with `Sphinx`_ used to generate these lovely HTML files that you're currently reading (unless you're reading this on GitHub). You can edit the documentation using any text editor and then generate the HTML output by running ``make html`` in the ``docs/`` directory. + +The function docstrings are written using the `numpydoc`_ extension for Sphinx. Make sure you check out its format guidelines before you start writing one. + +.. _reStructuredText: https://en.wikipedia.org/wiki/ReStructuredText +.. _Sphinx: http://www.sphinx-doc.org/en/master/ +.. _numpydoc: https://numpydoc.readthedocs.io/en/latest/format.html + +Filing Issues +------------- + +We use `GitHub issues`_ to keep track of all issues and pull requests. Before opening an issue (which asks a question or reports a bug), it is advisable to use GitHub search to look for existing issues (both open and closed) that may be similar. + +.. _GitHub issues: https://github.com/socialcopsdev/camelot/issues + +Questions +^^^^^^^^^ + +Please don't use GitHub issues for support questions, a better place for them would be `Stack Overflow`_. Make sure you tag them using the ``python-camelot`` tag. 
+ +.. _Stack Overflow: http://stackoverflow.com + +Bug Reports +^^^^^^^^^^^ + +- Please include your operating system type and Python version number, along with the version numbers of NumPy, OpenCV and Camelot. You can use the following code snippet to find this information:: + + import platform; print(platform.platform()) + import sys; print('Python', sys.version) + import numpy; print('NumPy', numpy.__version__) + import cv2; print('OpenCV', cv2.__version__) + import camelot; print('Camelot', camelot.__version__) + +- Please include the **complete traceback** in your bug report. + +- Make sure you include **steps to reproduce the bug**, using code snippets. See `Creating and highlighting code blocks`_. + +.. _Creating and highlighting code blocks: https://help.github.com/articles/creating-and-highlighting-code-blocks/ + +- Also include a link to the PDF document that you were trying to extract tables from, telling us what you expected the code to do and what actually happened. \ No newline at end of file diff --git a/docs/index.rst b/docs/index.rst index 4b91c69..9206925 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,135 +1,105 @@ -.. camelot documentation master file, created by +.. Camelot documentation master file, created by sphinx-quickstart on Tue Jul 19 13:44:18 2016. You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. -===================================== Camelot: PDF Table Parsing for Humans ===================================== -Camelot is a Python 2.7 library and command-line tool for extracting tabular data from PDF files. +Release v\ |version|. (:ref:`Installation `) -Why another pdf table parsing library? -====================================== +.. image:: https://img.shields.io/badge/license-MIT-lightgrey.svg + :target: https://pypi.org/project/camelot-py/ -We tried a lot of tools available online to parse tables from pdf files. 
`PDFTables`_, `SolidConverter`_ are closed source, commercial products and a free trial doesn't last forever. `Tabula`_, which is open source, isn't very scalable. We found nothing that gave us complete control over the parsing process. In most cases, we didn't get the correct output and had to resort to writing custom scripts for each type of pdf. +.. image:: https://img.shields.io/badge/python-2.7-blue.svg + :target: https://pypi.org/project/camelot-py/ -.. _PDFTables: https://pdftables.com/ -.. _SolidConverter: http://www.soliddocuments.com/pdf/-to-word-converter/304/1 -.. _Tabula: http://tabula.technology/ +**Camelot** is a Python library which makes it easy for *anyone* to extract tables from PDF files! -Some background -=============== +.. note:: Camelot only works with: -PDF started as `The Camelot Project`_ when people wanted a cross-platform way for sending and viewing documents. A pdf file contains characters placed at specific x,y-coordinates. Spaces are simulated by placing characters relatively far apart. + - Python 2, with **Python 3** support `on the way`_. + - Text-based PDFs and not scanned documents. If you can click-and-drag to select text in your table in a PDF viewer, then your PDF is text-based. Support for image-based PDFs using **OCR** is `planned`_. -Camelot uses two methods to parse tables from PDFs, :doc:`lattice ` and :doc:`stream `. The names were taken from Tabula but the implementation is somewhat different, though it follows the same philosophy. Lattice looks for lines between text elements while stream looks for whitespace between text elements. +.. _on the way: https://github.com/socialcopsdev/camelot/issues/81 +.. _planned: https://github.com/socialcopsdev/camelot/issues/101 -.. _The Camelot Project: http://www.planetpdf.com/planetpdf/pdfs/warnock_camelot.pdf +------------------------ -Usage -===== +**Here's how you can extract tables from PDF files.** Check out the PDF used in this example, `here`_. + +.. 
_here: _static/pdf/foo.pdf :: >>> import camelot - >>> tables = camelot.read_pdf("foo.pdf") + >>> tables = camelot.read_pdf('foo.pdf', mesh=True) >>> tables - - >>> tables.export("foo.csv", f="csv", compress=True) # json, excel, html + + >>> tables.export('foo.csv', f='csv', compress=True) # json, excel, html >>> tables[0] - - >>> tables[0].to_csv("foo.csv") # to_json, to_excel, to_html +
>>> tables[0].parsing_report { - "accuracy": 96, - "whitespace": 80, - "order": 1, - "page": 1 + 'accuracy': 99.02, + 'whitespace': 12.24, + 'order': 1, + 'page': 1 } - >>> df = tables[0].df + >>> tables[0].to_csv('foo.csv') # to_json, to_excel, to_html + >>> tables[0].df # get a pandas DataFrame! .. csv-table:: - :header: "Cycle Name","KI (1/km)","Distance (mi)","Percent Fuel Savings","","","" + :file: _static/csv/foo.csv - "","","","Improved Speed","Decreased Accel","Eliminate Stops","Decreased Idle" - "2012_2","3.30","1.3","5.9%","9.5%","29.2%","17.4%" - "2145_1","0.68","11.2","2.4%","0.1%","9.5%","2.7%" - "4234_1","0.59","58.7","8.5%","1.3%","8.5%","3.3%" - "2032_2","0.17","57.8","21.7%","0.3%","2.7%","1.2%" - "4171_1","0.07","173.9","58.1%","1.6%","2.1%","0.5%" +There's a :ref:`command-line interface ` too! -Installation -============ - -Make sure you have the most updated versions for `pip` and `setuptools`. You can update them by:: - - pip install -U pip setuptools - -The dependencies include `tk`_ and `ghostscript`_. - -.. _tk: https://wiki.tcl.tk/3743 -.. _ghostscript: https://www.ghostscript.com/ - -Installing dependencies ------------------------ - -tk and ghostscript can be installed using your system's default package manager. - -Linux -^^^^^ - -* Ubuntu - -:: - - sudo apt-get install python-opencv python-tk ghostscript - -* Arch Linux - -:: - - sudo pacman -S opencv tk ghostscript - -OS X -^^^^ - -:: - - brew install homebrew/science/opencv ghostscript - -Finally, `cd` into the project directory and install by:: - - python setup.py install - -API Reference -============= - -See :doc:`API doc `. - -Development -=========== - -Code ----- - -You can check the latest sources with the command:: - - git clone https://github.com/socialcopsdev/camelot.git - -Contributing +Why Camelot? ------------ -See :doc:`Contributing guidelines `. 
+- **You are in control**: Unlike other libraries and tools which either give a nice output or fail miserably (with no in-between), Camelot gives you the power to tweak table extraction. (Since everything in the real world, including PDF table extraction, is fuzzy.) +- **Metrics**: *Bad* tables can be discarded based on metrics like accuracy and whitespace, without ever having to manually look at each table. +- Each table is a **pandas DataFrame**, which enables seamless integration into data analysis workflows. +- **Export** to multiple formats, including json, excel and html. +- Simple and Elegant API, written in **Python**! -Testing -------- +See `comparison with other PDF parsing libraries and tools`_. -:: +.. _comparison with other PDF parsing libraries and tools: https://github.com/socialcopsdev/camelot/wiki/Comparison-with-other-PDF-Table-Parsing-libraries-and-tools - python setup.py test +The User Guide +-------------- -License -======= +This part of the documentation begins with some background information about why Camelot was created, takes a small dip into the implementation details and then focuses on step-by-step instructions for getting the most out of Camelot. -BSD License \ No newline at end of file +.. toctree:: + :maxdepth: 2 + + user/intro + user/install + user/how-it-works + user/quickstart + user/advanced + user/cli + +The API Documentation / Guide +----------------------------- + +If you are looking for information on a specific function, class, or method, +this part of the documentation is for you. + +.. toctree:: + :maxdepth: 2 + + api + +The Contributor Guide +--------------------- + +If you want to contribute to the project, this part of the documentation is for +you. + +.. toctree:: + :maxdepth: 2 + + dev/contributing \ No newline at end of file diff --git a/docs/lattice.rst b/docs/lattice.rst deleted file mode 100644 index 0d89be9..0000000 --- a/docs/lattice.rst +++ /dev/null @@ -1,193 +0,0 @@ -.. 
_lattice: - -======= -Lattice -======= - -Lattice method is designed to work on pdf files which have tables with well-defined grids. It looks for lines on a page to form a table. - -Lattice uses OpenCV to apply a set of morphological transformations (erosion and dilation) to find horizontal and vertical line segments in a pdf page after converting it to an image using imagemagick. - -.. note:: Currently, Lattice only works on pdf files that contain text. However, we plan to add `OCR support`_ in the future. - -.. _OCR support: https://github.com/socialcopsdev/camelot/issues/14 - -Let's see how Lattice processes this pdf, step by step. - -Line segments are detected in the first step. - -.. .. _this: insert link for us-030.pdf - -.. image:: assets/line.png - :height: 674 - :width: 1366 - :scale: 50% - :align: left - -The detected line segments are overlapped by `and` ing their pixel intensities to find intersections. - -.. image:: assets/intersection.png - :height: 674 - :width: 1366 - :scale: 50% - :align: left - -The detected line segments are overlapped again, this time by `or` ing their pixel intensities and outermost contours are computed to identify potential table boundaries. This helps Lattice in detecting more than one table on a single page. - -.. image:: assets/contour.png - :height: 674 - :width: 1366 - :scale: 50% - :align: left - -Since dimensions of a pdf and its image vary; table contours, intersections and segments are scaled and translated to the pdf's coordinate space. A representation of the table is then created using these scaled coordinates. - -.. image:: assets/table.png - :height: 674 - :width: 1366 - :scale: 50% - :align: left - -Spanning cells are then detected using the line segments and intersections. - -.. image:: assets/table_span.png - :height: 674 - :width: 1366 - :scale: 50% - :align: left - -Finally, the characters found on the page are assigned to cells based on their x,y coordinates. 
- -:: - - >>> from camelot.pdf import Pdf - >>> from camelot.lattice import Lattice - - >>> manager = Pdf(Lattice(), 'us-030.pdf') - >>> tables = manager.extract() - >>> print tables['page-1']['table-1']['data'] - -.. csv-table:: - :header: "Cycle Name","KI (1/km)","Distance (mi)","Percent Fuel Savings","","","" - - "","","","Improved Speed","Decreased Accel","Eliminate Stops","Decreased Idle" - "2012_2","3.30","1.3","5.9%","9.5%","29.2%","17.4%" - "2145_1","0.68","11.2","2.4%","0.1%","9.5%","2.7%" - "4234_1","0.59","58.7","8.5%","1.3%","8.5%","3.3%" - "2032_2","0.17","57.8","21.7%","0.3%","2.7%","1.2%" - "4171_1","0.07","173.9","58.1%","1.6%","2.1%","0.5%" - -Scale ------ - -The scale parameter is used to determine the length of the structuring element used for morphological transformations. The length of vertical and horizontal structuring elements are found by dividing the image's height and width respectively, by `scale`. Large `scale` will lead to a smaller structuring element, which means that smaller lines will be detected. The default value for scale is 15. - -Let's consider this pdf file. - -.. .. _this: insert link for row_span_1.pdf - -.. image:: assets/scale_1.png - :height: 674 - :width: 1366 - :scale: 50% - :align: left - -Clearly, it couldn't detected those small lines in the lower left part. Therefore, we need to increase the value of scale. Let's try a value of 40. - -.. image:: assets/scale_2.png - :height: 674 - :width: 1366 - :scale: 50% - :align: left - -Voila! It detected the smaller lines. - -Fill ----- - -In the file used above, you can see that some cells spanned a lot of rows, `fill` just copies the same value to all rows/columns of a spanning cell. You can apply fill horizontally, vertically or both. Let us fill the output for the file we used above, vertically. 
- -:: - - >>> from camelot.pdf import Pdf - >>> from camelot.lattice import Lattice - - >>> manager = Pdf(Lattice(fill=['v'], scale=40), 'row_span_1.pdf') - >>> tables = manager.extract() - >>> print tables['page-1']['table-1']['data'] - -.. csv-table:: - :header: "Plan Type","County","Plan Name","Totals" - - "GMC","Sacramento","Anthem Blue Cross","164,380" - "GMC","Sacramento","Health Net","126,547" - "GMC","Sacramento","Kaiser Foundation","74,620" - "GMC","Sacramento","Molina Healthcare","59,989" - "GMC","San Diego","Care 1st Health Plan","71,831" - "GMC","San Diego","Community...","264,639" - "GMC","San Diego","Health Net","72,404" - "GMC","San Diego","Kaiser","50,415" - "GMC","San Diego","Molina Healthcare","206,430" - "GMC","Total GMC...","","1,091,255" - "COHS","Marin","Partnership Health...","36,006" - "COHS","Mendocino","Partnership Health...","37,243" - "COHS","Napa","Partnership Health...","28,398" - "COHS","Solano","Partnership Health...","113,220" - "COHS","Sonoma","Partnership Health...","112,271" - "COHS","Yolo","Partnership Health...","52,674" - "COHS","Del Norte","Partnership Health...","11,242" - "COHS","Humboldt","Partnership Health...","49,911" - "COHS","Lake","Partnership Health...","29,149" - "COHS","Lassen","Partnership Health...","7,360" - "COHS","Modoc","Partnership Health...","2,940" - "COHS","Shasta","Partnership Health...","61,763" - "COHS","Siskiyou","Partnership Health...","16,715" - "COHS","Trinity","Partnership Health...","4,542" - "COHS","Merced","Central California...","123,907" - "COHS","Monterey","Central California...","147,397" - "COHS","Santa Cruz","Central California...","69,458" - "COHS","Santa Barbara","CenCal","117,609" - "COHS","San Luis Obispo","CenCal","55,761" - "COHS","Orange","CalOptima","783,079" - "COHS","San Mateo","Health Plan...","113,202" - "COHS","Ventura","Gold Coast...","202,217" - "COHS","Total COHS...","","2,176,064" - "Subtotal for...","","","10,132,022" - "PCCM","Los Angeles","AIDS Healthcare...","828" - 
"PCCM","San Francisco","Family Mosaic","25" - "PCCM","Total PHP...","","853" - "All Models...","","","10,132,875" - "Source: Data...","","","" - -Invert ------- - -To find line segments, Lattice needs the lines of the pdf file to be in foreground. So, if you encounter a file like this, just set invert to True. - -.. .. _this: insert link for lines_in_background_1.pdf - -:: - - >>> from camelot.pdf import Pdf - >>> from camelot.lattice import Lattice - - >>> manager = Pdf(Lattice(invert=True), 'lines_in_background_1.pdf') - >>> tables = manager.extract() - >>> print tables['page-1']['table-1']['data'] - -.. csv-table:: - :header: "State","Date","Halt stations","Halt days","Persons directly reached(in lakh)","Persons trained","Persons counseled","Persons testedfor HIV" - - "Delhi","1.12.2009","8","17","1.29","3,665","2,409","1,000" - "Rajasthan","2.12.2009 to 19.12.2009","","","","","","" - "Gujarat","20.12.2009 to 3.1.2010","6","13","6.03","3,810","2,317","1,453" - "Maharashtra","4.01.2010 to 1.2.2010","13","26","1.27","5,680","9,027","4,153" - "Karnataka","2.2.2010 to 22.2.2010","11","19","1.80","5,741","3,658","3,183" - "Kerala","23.2.2010 to 11.3.2010","9","17","1.42","3,559","2,173","855" - "Total","","47","92","11.81","22,455","19,584","10,644" - -Lattice can also parse pdf files with tables like these that are rotated clockwise/anti-clockwise by 90 degrees. - -.. .. _these: insert link for left_rotated_table.pdf - -You can call Lattice with debug={'line', 'intersection', 'contour', 'table'}, and call `debug_plot()` which will generate an image like the ones on this page, with the help of which you can modify various parameters. See :doc:`API doc ` for more information. diff --git a/docs/stream.rst b/docs/stream.rst deleted file mode 100644 index dd1aa1e..0000000 --- a/docs/stream.rst +++ /dev/null @@ -1,133 +0,0 @@ -.. 
_stream: - -====== -Stream -====== - -Stream method is the complete opposite of Lattice and works on pdf files which have text placed uniformly apart across rows to simulate a table. It looks for spaces between text to form a table representation. - -Stream builds on top of PDFMiner's functionality of grouping characters on a page into words and sentences. After getting these words, it groups them into rows based on their y-coordinates and tries to guess the number of columns a pdf table might have by calculating the mode of the number of words in each row. Additionally, the user can specify the number of columns or column x-coordinates. - -Let's run it on this pdf. - -:: - - >>> from camelot.pdf import Pdf - >>> from camelot.stream import Stream - - >>> manager = Pdf(Stream(), 'eu-027.pdf') - >>> tables = manager.extract() - >>> print tables['page-1']['table-1']['data'] - -.. .. _this: insert link for eu-027.pdf - -.. csv-table:: - - "C","Appendix C:...","","","" - "","Table C1:...","","","" - "","This table...","","","" - "Variable","Mean","Std. 
Dev.","Min","Max" - "Age","50.8","15.9","21","90" - "Men","0.47","0.50","0","1" - "East","0.28","0.45","0","1" - "Rural","0.15","0.36","0","1" - "Married","0.57","0.50","0","1" - "Single","0.21","0.40","0","1" - "Divorced","0.13","0.33","0","1" - "Widowed","0.08","0.26","0","1" - "Separated","0.03","0.16","0","1" - "Partner","0.65","0.48","0","1" - "Employed","0.55","0.50","0","1" - "Fulltime","0.34","0.47","0","1" - "Parttime","0.20","0.40","0","1" - "Unemployed","0.08","0.28","0","1" - "Homemaker","0.19","0.40","0","1" - "Retired","0.28","0.45","0","1" - "Household size","2.43","1.22","1","9" - "Households...","0.37","0.48","0","1" - "Number of...","1.67","1.38","0","8" - "Lower...","0.08","0.27","0","1" - "Upper...","0.60","0.49","0","1" - "Post...","0.12","0.33","0","1" - "First...","0.17","0.38","0","1" - "Other...","0.03","0.17","0","1" - "Household...","2,127","1,389","22","22,500" - "Gross...","187,281","384,198","0","7,720,000" - "Gross...","38,855","114,128","0","2,870,000" - "","Source:...","","","" - "","","","","ECB" - "","","","","Working..." - "","","","","Febuary..." - -We can also specify the column x-coordinates. We need to call Stream with debug=True and use matplotlib's interface to note down the column x-coordinates we need. Let's try it on this pdf file. - -:: - - >>> from camelot.pdf import Pdf - >>> from camelot.stream import Stream - - >>> manager = Pdf(Stream(debug=True), 'mexican_towns.pdf'), debug=True - >>> manager.debug_plot() - -.. image:: assets/columns.png - :height: 674 - :width: 1366 - :scale: 50% - :align: left - -After getting the x-coordinates, we just need to pass them to Stream, like this. - -:: - - >>> from camelot.pdf import Pdf - >>> from camelot.stream import Stream - - >>> manager = Pdf(Stream(columns=['28,67,180,230,425,475,700']), 'mexican_towns.pdf') - >>> tables = manager.extract() - >>> print tables['page-1']['table-1']['data'] - -.. 
csv-table:: - - "Clave","","Clave","","Clave","" - "","Nombre Entidad","","Nombre Municipio","","Nombre Localidad" - "Entidad","","Municipio","","Localidad","" - "01","Aguascalientes","001","Aguascalientes","0094","Granja Adelita" - "01","Aguascalientes","001","Aguascalientes","0096","Agua Azul" - "01","Aguascalientes","001","Aguascalientes","0100","Rancho Alegre" - "01","Aguascalientes","001","Aguascalientes","0102","Los Arbolitos [Rancho]" - "01","Aguascalientes","001","Aguascalientes","0104","Ardillas de Abajo (Las Ardillas)" - "01","Aguascalientes","001","Aguascalientes","0106","Arellano" - "01","Aguascalientes","001","Aguascalientes","0112","Bajío los Vázquez" - "01","Aguascalientes","001","Aguascalientes","0113","Bajío de Montoro" - "01","Aguascalientes","001","Aguascalientes","0114","Residencial San Nicolás [Baños la Cantera]" - "01","Aguascalientes","001","Aguascalientes","0120","Buenavista de Peñuelas" - "01","Aguascalientes","001","Aguascalientes","0121","Cabecita 3 Marías (Rancho Nuevo)" - "01","Aguascalientes","001","Aguascalientes","0125","Cañada Grande de Cotorina" - "01","Aguascalientes","001","Aguascalientes","0126","Cañada Honda [Estación]" - "01","Aguascalientes","001","Aguascalientes","0127","Los Caños" - "01","Aguascalientes","001","Aguascalientes","0128","El Cariñán" - "01","Aguascalientes","001","Aguascalientes","0129","El Carmen [Granja]" - "01","Aguascalientes","001","Aguascalientes","0135","El Cedazo (Cedazo de San Antonio)" - "01","Aguascalientes","001","Aguascalientes","0138","Centro de Arriba (El Taray)" - "01","Aguascalientes","001","Aguascalientes","0139","Cieneguilla (La Lumbrera)" - "01","Aguascalientes","001","Aguascalientes","0141","Cobos" - "01","Aguascalientes","001","Aguascalientes","0144","El Colorado (El Soyatal)" - "01","Aguascalientes","001","Aguascalientes","0146","El Conejal" - "01","Aguascalientes","001","Aguascalientes","0157","Cotorina de Abajo" - "01","Aguascalientes","001","Aguascalientes","0162","Coyotes" - 
"01","Aguascalientes","001","Aguascalientes","0166","La Huerta (La Cruz)" - "01","Aguascalientes","001","Aguascalientes","0170","Cuauhtémoc (Las Palomas)" - "01","Aguascalientes","001","Aguascalientes","0171","Los Cuervos (Los Ojos de Agua)" - "01","Aguascalientes","001","Aguascalientes","0172","San José [Granja]" - "01","Aguascalientes","001","Aguascalientes","0176","La Chiripa" - "01","Aguascalientes","001","Aguascalientes","0182","Dolores" - "01","Aguascalientes","001","Aguascalientes","0183","Los Dolores" - "01","Aguascalientes","001","Aguascalientes","0190","El Duraznillo" - "01","Aguascalientes","001","Aguascalientes","0191","Los Durón" - "01","Aguascalientes","001","Aguascalientes","0197","La Escondida" - "01","Aguascalientes","001","Aguascalientes","0201","Brande Vin [Bodegas]" - "01","Aguascalientes","001","Aguascalientes","0207","Valle Redondo" - "01","Aguascalientes","001","Aguascalientes","0209","La Fortuna" - "01","Aguascalientes","001","Aguascalientes","0212","Lomas del Gachupín" - "01","Aguascalientes","001","Aguascalientes","0213","El Carmen (Gallinas Güeras) [Rancho]" - "01","Aguascalientes","001","Aguascalientes","0216","La Gloria" diff --git a/docs/user/advanced.rst b/docs/user/advanced.rst new file mode 100644 index 0000000..9f3f127 --- /dev/null +++ b/docs/user/advanced.rst @@ -0,0 +1,410 @@ +.. _advanced: + +Advanced Usage +============== + +This page covers some of the more advanced configurations for :ref:`Stream ` and :ref:`Lattice `. + +Process background lines +------------------------ + +To detect line segments, :ref:`Lattice ` needs the lines that make the table, to be in foreground. Here's an example of a table with lines in background. + +.. figure:: ../_static/png/background_lines.png + :scale: 50% + :alt: A table with lines in background + :align: left + +Source: `PDF <../_static/pdf/background_lines.pdf>`__ + +To process background lines, you can pass ``process_background=True``. 
+ +:: + + >>> tables = camelot.read_pdf('background_lines.pdf', mesh=True, process_background=True) + >>> tables[1].df + +.. csv-table:: + :file: ../_static/csv/background_lines.csv + +Plot geometry +------------- + +You can use the :meth:`plot_geometry() ` method to plot various geometries that were detected by Camelot while processing the PDF page. This can help you select table areas, column separators and debug bad table outputs, by tweaking different configuration parameters. + +The following geometries are available for plotting. You can pass them to the :meth:`plot_geometry() ` method with the ``geometry_type`` keyword argument, which will then generate a `matplotlib `_ plot. + +- 'text' +- 'table' +- 'contour' +- 'line' +- 'joint' + +.. note:: The last three geometries can only be used with :ref:`Lattice `, i.e. when ``mesh=True``. + +Let's generate a plot for each geometry using this `PDF <_static/pdf/foo.pdf>`__ as an example. + +.. warning:: By default, :meth:`plot_geometry() ` will use the first page of the PDF. Since this method is useful only for debugging, it makes sense to use it for one page at a time. If you pass a page range to this method, multiple plots will be generated one by one, each popping up as you close the previous one. To abort, you can use ``Ctrl + C``. + +.. _geometry_text: + +text +^^^^ + +Passing ``geometry_type=text`` creates a plot for all the text present on a PDF page. + +:: + + >>> camelot.plot_geometry('foo.pdf', geometry_type='text') + +.. figure:: ../_static/png/geometry_text.png + :height: 674 + :width: 1366 + :scale: 50% + :alt: A plot of all text on a PDF page + :align: left + +This, as we shall later see, is very helpful with :ref:`Stream `, for noting table areas and column separators, in case Stream cannot guess them correctly. + +.. note:: As you can see in the image above, the *x-y* coordinates change as you move your mouse cursor, which can help you note coordinates. + +.. 
_geometry_table: + +table +^^^^^ + +Passing ``geometry_type=table`` creates a plot for tables detected on a PDF page. This geometry, along with contour, line and joint, is useful for debugging and improving the parsing output, as we shall see later. + +:: + + >>> camelot.plot_geometry('foo.pdf', mesh=True, geometry_type='table') + +.. figure:: ../_static/png/geometry_table.png + :height: 674 + :width: 1366 + :scale: 50% + :alt: A plot of all tables on a PDF page + :align: left + +.. _geometry_contour: + +contour +^^^^^^^ + +Passing ``geometry_type=contour`` creates a plot for table boundaries detected on a PDF page. + +:: + + >>> camelot.plot_geometry('foo.pdf', mesh=True, geometry_type='contour') + +.. figure:: ../_static/png/geometry_contour.png + :height: 674 + :width: 1366 + :scale: 50% + :alt: A plot of all contours on a PDF page + :align: left + +.. _geometry_line: + +line +^^^^ + +Passing ``geometry_type=line`` creates a plot for lines detected on a PDF page. + +:: + + >>> camelot.plot_geometry('foo.pdf', mesh=True, geometry_type='line') + +.. figure:: ../_static/png/geometry_line.png + :height: 674 + :width: 1366 + :scale: 50% + :alt: A plot of all lines on a PDF page + :align: left + +.. _geometry_joint: + +joint +^^^^^ + +Passing ``geometry_type=joint`` creates a plot for line intersections detected on a PDF page. + +:: + + >>> camelot.plot_geometry('foo.pdf', mesh=True, geometry_type='joint') + +.. figure:: ../_static/png/geometry_joint.png + :height: 674 + :width: 1366 + :scale: 50% + :alt: A plot of all line intersections on a PDF page + :align: left + +Specify table areas +------------------- + +Since :ref:`Stream ` treats the whole page as a table, `for now`_, it's useful to specify table boundaries in cases such as this `PDF <_static/pdf/table_areas.pdf>`__. You can `plot the text `_ on this page and note the left-top and right-bottom coordinates of the table. 
+ +Table areas that you want Camelot to analyze can be passed as a list of comma-separated strings to :meth:`read_pdf() `. + +.. _for now: https://github.com/socialcopsdev/camelot/issues/102 + +:: + + >>> tables = camelot.read_pdf('table_areas.pdf', table_areas=['316,499,566,337']) + >>> tables[0].df + +.. csv-table:: + :file: ../_static/csv/table_areas.csv + +Specify column separators +------------------------- + +In cases like this `PDF <_static/pdf/column_separators.pdf>`__, where the text is very close to each other, it is possible that Camelot may guess the column separators' coordinates incorrectly. To correct this, you can explicitly specify the *x* coordinate for each column separator by `plotting the text `_ on the page. + +You can pass the column separators as a list of comma-separated strings to :meth:`read_pdf() `. + +In case you passed a single column separators string list, and no table area is specified, the separators will be applied to the whole page. When a list of table areas is specified and there is a need to specify column separators as well, the length of both lists should be equal, each table area will be mapped to each column separators' string using their indices. + +If you have specified two table areas, ``table_areas=['12,23,43,54', '20,33,55,67']``, and only want to specify column separators for the first table (since you can see by looking at the table that Camelot will be able to get it perfectly!), you can pass an empty string for the second table in the column separators' list, like this, ``columns=['10,120,200,400', '']``. + +Let's get back to the *x* coordinates we got from `plotting text `_ that exists on this `PDF <_static/pdf/column_separators.pdf>`__, and get the table out! + +:: + + >>> tables = camelot.read_pdf('column_separators.pdf', columns=['72,95,209,327,442,529,566,606,683']) + >>> tables[0].df + +.. csv-table:: + + "...","...","...","...","...","...","...","...","...","..." 
+ "LICENSE","","","","PREMISE","","","","","" + "NUMBER TYPE DBA NAME","","","LICENSEE NAME","ADDRESS","CITY","ST","ZIP","PHONE NUMBER","EXPIRES" + "...","...","...","...","...","...","...","...","...","..." + +Ah! Since `PDFMiner `_ merged the strings, "NUMBER", "TYPE" and "DBA NAME", all of them were assigned to the same cell. Let's see how we can fix this in the next section. + +Split text along separators +--------------------------- + +To deal with cases like the output from the previous section, you can pass ``split_text=True`` to :meth:`read_pdf() `, which will split any strings that lie in different cells but have been assigned to a single cell (as a result of being merged together by `PDFMiner `_). + +:: + + >>> tables = camelot.read_pdf('column_separators.pdf', columns=['72,95,209,327,442,529,566,606,683'], split_text=True) + >>> tables[0].df + +.. csv-table:: + + "...","...","...","...","...","...","...","...","...","..." + "LICENSE","","","","PREMISE","","","","","" + "NUMBER","TYPE","DBA NAME","LICENSEE NAME","ADDRESS","CITY","ST","ZIP","PHONE NUMBER","EXPIRES" + "...","...","...","...","...","...","...","...","...","..." + +Flag superscripts and subscripts +-------------------------------- + +There might be cases where you want to differentiate between the text and superscripts and subscripts, like this `PDF <_static/pdf/superscript.pdf>`__. + +.. figure:: ../_static/png/superscript.png + :alt: A PDF with superscripts + :align: left + +In this case, the text that `other tools`_ return will be ``24.912``. This is harmless as long as there is that decimal point involved. When it isn't, you'll be left wondering why the results of your data analysis were 10x bigger! + +You can solve this by passing ``flag_size=True``, which will enclose the superscripts and subscripts with ``<s></s>``, based on font size, as shown below. + +.. 
_other tools: https://github.com/socialcopsdev/camelot/wiki/Comparison-with-other-PDF-Table-Parsing-libraries-and-tools + +:: + + >>> tables = camelot.read_pdf('superscript.pdf', flag_size=True) + >>> tables[0].df + +.. csv-table:: + + "...","...","...","...","...","...","...","...","...","...","..." + "Karnataka","22.44","19.59","-","-","2.86","1.22","-","0.89","-","0.69" + "Kerala","29.03","24.91<s>2</s>","-","-","4.11","1.77","-","0.48","-","1.45" + "Madhya Pradesh","27.13","23.57","-","-","3.56","0.38","-","1.86","-","1.28" + "...","...","...","...","...","...","...","...","...","...","..." + +Control how text is grouped into rows +------------------------------------- + +You can pass ``row_close_tol=<+int>`` to group the rows closer together, as shown below. + +:: + + >>> tables = camelot.read_pdf('group_rows.pdf') + >>> tables[0].df + +.. csv-table:: + + "Clave","","Clave","","","Clave","" + "","Nombre Entidad","","","Nombre Municipio","","Nombre Localidad" + "Entidad","","Municipio","","","Localidad","" + "01","Aguascalientes","001","Aguascalientes","","0094","Granja Adelita" + "01","Aguascalientes","001","Aguascalientes","","0096","Agua Azul" + "01","Aguascalientes","001","Aguascalientes","","0100","Rancho Alegre" + +:: + + >>> tables = camelot.read_pdf('group_rows.pdf', row_close_tol=10) + >>> tables[0].df + +.. csv-table:: + + "Clave","Nombre Entidad","Clave","","Nombre Municipio","Clave","Nombre Localidad" + "Entidad","","Municipio","","","Localidad","" + "01","Aguascalientes","001","Aguascalientes","","0094","Granja Adelita" + "01","Aguascalientes","001","Aguascalientes","","0096","Agua Azul" + "01","Aguascalientes","001","Aguascalientes","","0100","Rancho Alegre" + +Detect short lines +------------------ + +There might be cases while using :ref:`Lattice ` when smaller lines don't get detected. The size of the smallest line that gets detected is calculated by dividing the PDF page's dimensions by a scaling factor called ``line_size_scaling``. 
By default, its value is 15. + +As you can already guess, the larger the ``line_size_scaling``, the smaller the size of lines getting detected. + +.. warning:: Making ``line_size_scaling`` very large (>150) will lead to text getting detected as lines. + +Here's one `PDF <_static/pdf/short_lines.pdf>`__ where small lines separating the headers don't get detected with the default value of 15. Let's `plot the table `_ for this PDF. + +.. figure:: ../_static/png/short_lines.png + :alt: A PDF table with short lines + :align: left + +:: + + >>> camelot.plot_geometry('short_lines.pdf', mesh=True, geometry_type='table') + +.. figure:: ../_static/png/short_lines_1.png + :alt: A plot of the PDF table with short lines + :align: left + +Clearly, the smaller lines separating the headers couldn't be detected. Let's try with ``line_size_scaling=40``, and `plot the table `_ again. + +:: + + >>> camelot.plot_geometry('short_lines.pdf', mesh=True, geometry_type='table', line_size_scaling=40) + +.. figure:: ../_static/png/short_lines_2.png + :alt: An improved plot of the PDF table with short lines + :align: left + +Voila! Camelot can now see those lines. Let's use this value in :meth:`read_pdf() ` and get our table. + +:: + + >>> tables = camelot.read_pdf('short_lines.pdf', mesh=True, line_size_scaling=40) + >>> tables[0].df + +.. csv-table:: + + "Investigations","No. 
ofHHs","Age/Sex/Physiological Group","Preva-lence","C.I*","RelativePrecision","Sample sizeper State" + "Anthropometry","2400","All ...","","","","" + "Clinical Examination","","","","","","" + "History of morbidity","","","","","","" + "Diet survey","1200","All ...","","","","" + "Blood Pressure #","2400","Men (≥ 18yrs)","10%","95%","20%","1728" + "","","Women (≥ 18 yrs)","","","","1728" + "Fasting blood glucose","2400","Men (≥ 18 yrs)","5%","95%","20%","1825" + "","","Women (≥ 18 yrs)","","","","1825" + "Knowledge &Practices on HTN &DM","2400","Men (≥ 18 yrs)","-","-","-","1728" + "","2400","Women (≥ 18 yrs)","-","-","-","1728" + +Shift text in spanning cells +---------------------------- + +By default, the :ref:`Lattice ` method shifts text in spanning cells, first to the left and then to the top, as you can observe in the output table above. However, this behavior can be changed using the ``shift_text`` keyword argument. Think of it as setting the *gravity* for a table; it decides where the text moves and finally comes to rest. + +``shift_text`` expects a list with one or more characters from the following set: ``('', 'l', 'r', 't', 'b')``, which are then applied *in order*. The default, as we discussed above, is ``['l', 't']``. + +We'll use the `PDF <_static/pdf/short_lines.pdf>`__ from the previous example. Let's pass ``shift_text=['']``, which basically means that the text will experience weightlessness! (It will remain in place.) + +.. figure:: ../_static/png/short_lines.png + :alt: A PDF table with short lines + :align: left + +:: + + >>> tables = camelot.read_pdf('short_lines.pdf', mesh=True, line_size_scaling=40, shift_text=['']) + >>> tables[0].df + +.. csv-table:: + + "Investigations","No. 
ofHHs","Age/Sex/Physiological Group","Preva-lence","C.I*","RelativePrecision","Sample sizeper State" + "Anthropometry","","","","","","" + "Clinical Examination","2400","","All ...","","","" + "History of morbidity","","","","","","" + "Diet survey","1200","","All ...","","","" + "","","Men (≥ 18yrs)","","","","1728" + "Blood Pressure #","2400","Women (≥ 18 yrs)","10%","95%","20%","1728" + "","","Men (≥ 18 yrs)","","","","1825" + "Fasting blood glucose","2400","Women (≥ 18 yrs)","5%","95%","20%","1825" + "Knowledge &Practices on HTN &","2400","Men (≥ 18 yrs)","-","-","-","1728" + "DM","2400","Women (≥ 18 yrs)","-","-","-","1728" + +No surprises there, it did remain in place. Let's pass ``shift_text=['r', 'b']``, to set the *gravity* to right-bottom, and move the text in that direction. + +:: + + >>> tables = camelot.read_pdf('short_lines.pdf', mesh=True, line_size_scaling=40, shift_text=['r', 'b']) + >>> tables[0].df + +.. csv-table:: + + "Investigations","No. ofHHs","Age/Sex/Physiological Group","Preva-lence","C.I*","RelativePrecision","Sample sizeper State" + "Anthropometry","","","","","","" + "Clinical Examination","","","","","","" + "History of morbidity","2400","","","","","All ..." + "Diet survey","1200","","","","","All ..." + "","","Men (≥ 18yrs)","","","","1728" + "Blood Pressure #","2400","Women (≥ 18 yrs)","10%","95%","20%","1728" + "","","Men (≥ 18 yrs)","","","","1825" + "Fasting blood glucose","2400","Women (≥ 18 yrs)","5%","95%","20%","1825" + "","2400","Men (≥ 18 yrs)","-","-","-","1728" + "Knowledge &Practices on HTN &DM","2400","Women (≥ 18 yrs)","-","-","-","1728" + +Copy text in spanning cells +--------------------------- + +You can copy text in spanning cells when using :ref:`Lattice `, in either horizontal or vertical direction or both. This behavior is disabled by default. + +``copy_text`` expects a list with one or more characters from the following set: ``('v', 'h')``, which are then applied *in order*. 
+ +Let's try it out on this `PDF <_static/pdf/copy_text.pdf>`__. First, let's check out the output table to see if we need to use any other configuration parameters. + +:: + + >>> tables = camelot.read_pdf('copy_text.pdf', mesh=True) + >>> tables[0].df + +.. csv-table:: + + "Sl. No.","Name of State/UT","Name of District","Disease/ Illness","No. of Cases","No. of Deaths","Date of start of outbreak","Date of reporting","Current Status","..." + "1","Kerala","Kollam","i. Food Poisoning","19","0","31/12/13","03/01/14","Under control","..." + "2","Maharashtra","Beed","i. Dengue & Chikungunya i","11","0","03/01/14","04/01/14","Under control","..." + "3","Odisha","Kalahandi","iii. Food Poisoning","42","0","02/01/14","03/01/14","Under control","..." + "4","West Bengal","West Medinipur","iv. Acute Diarrhoeal Disease","145","0","04/01/14","05/01/14","Under control","..." + "","","Birbhum","v. Food Poisoning","199","0","31/12/13","31/12/13","Under control","..." + "","","Howrah","vi. Viral Hepatitis A &E","85","0","26/12/13","27/12/13","Under surveillance","..." + +We don't need anything else. Now, let's pass ``copy_text=['v']`` to copy text in the vertical direction. This can save you some time by not having to do this in your cleaning script! + +:: + + >>> tables = camelot.read_pdf('copy_text.pdf', mesh=True, copy_text=['v']) + >>> tables[0].df + +.. csv-table:: + + "Sl. No.","Name of State/UT","Name of District","Disease/ Illness","No. of Cases","No. of Deaths","Date of start of outbreak","Date of reporting","Current Status","..." + "1","Kerala","Kollam","i. Food Poisoning","19","0","31/12/13","03/01/14","Under control","..." + "2","Maharashtra","Beed","i. Dengue & Chikungunya i","11","0","03/01/14","04/01/14","Under control","..." + "3","Odisha","Kalahandi","iii. Food Poisoning","42","0","02/01/14","03/01/14","Under control","..." + "4","West Bengal","West Medinipur","iv. Acute Diarrhoeal Disease","145","0","04/01/14","05/01/14","Under control","..." 
+ "4","West Bengal","Birbhum","v. Food Poisoning","199","0","31/12/13","31/12/13","Under control","..." + "4","West Bengal","Howrah","vi. Viral Hepatitis A &E","85","0","26/12/13","27/12/13","Under surveillance","..." \ No newline at end of file diff --git a/docs/user/cli.rst b/docs/user/cli.rst new file mode 100644 index 0000000..6998f91 --- /dev/null +++ b/docs/user/cli.rst @@ -0,0 +1,78 @@ +.. _cli: + +Command-line interface +====================== + +Camelot comes with a command-line interface. + +You can print the help for the interface, by typing ``camelot --help`` in your favorite terminal program, as shown below. + +:: + + $ camelot --help + Usage: camelot [OPTIONS] FILEPATH + + Options: + -p, --pages TEXT Comma-separated page numbers to parse. + Example: 1,3,4 or 1,4-end + -o, --output TEXT Output filepath. + -f, --format [csv|json|excel|html] + Output file format. + -z, --zip Whether or not to create a ZIP archive. + -m, --mesh Whether or not to use Lattice method of + parsing. Stream is used by default. + -T, --table_area TEXT Table areas (x1,y1,x2,y2) to process. + x1, y1 + -> left-top and x2, y2 -> right-bottom + -split, --split_text Whether or not to split text if it spans + across multiple cells. + -flag, --flag_size (inactive) Whether or not to flag text which + has uncommon size. (Useful to detect + super/subscripts) + -M, --margins ... + char_margin, line_margin, word_margin for + PDFMiner. + -C, --columns TEXT x-coordinates of column separators. + -r, --row_close_tol INTEGER Rows will be formed by combining text + vertically within this tolerance. + -c, --col_close_tol INTEGER Columns will be formed by combining text + horizontally within this tolerance. + -back, --process_background (with --mesh) Whether or not to process + lines that are in background. + -scale, --line_size_scaling INTEGER + (with --mesh) Factor by which the page + dimensions will be divided to get smallest + length of detected lines. 
+ -copy, --copy_text [h|v] (with --mesh) Specify direction in which + text will be copied over in a spanning cell. + -shift, --shift_text [|l|r|t|b] (with --mesh) Specify direction in which + text in a spanning cell should flow. + -l, --line_close_tol INTEGER (with --mesh) Tolerance parameter used to + merge close vertical lines and close + horizontal lines. + -j, --joint_close_tol INTEGER (with --mesh) Tolerance parameter used to + decide whether the detected lines and points + lie close to each other. + -block, --threshold_blocksize INTEGER + (with --mesh) For adaptive thresholding, + size of a pixel neighborhood that is used to + calculate a threshold value for the pixel: + 3, 5, 7, and so on. + -const, --threshold_constant INTEGER + (with --mesh) For adaptive thresholding, + constant subtracted from the mean or + weighted mean. + Normally, it is positive but + may be zero or negative as well. + -I, --iterations INTEGER (with --mesh) Number of times for + erosion/dilation is applied. + -G, --geometry_type [text|table|contour|joint|line] + Plot geometry found on pdf page for + debugging. + text: Plot text objects. (Useful to get + table_area and columns coordinates) + table: Plot parsed table. + contour (with --mesh): Plot detected rectangles. + joint (with --mesh): Plot detected line intersections. + line (with --mesh): Plot detected lines. + --help Show this message and exit. \ No newline at end of file diff --git a/docs/user/how-it-works.rst b/docs/user/how-it-works.rst new file mode 100644 index 0000000..5fa3507 --- /dev/null +++ b/docs/user/how-it-works.rst @@ -0,0 +1,84 @@ +.. _how_it_works: + +How It Works +============ + +This part of the documentation details a high-level explanation of how Camelot extracts tables from PDF files. + +You can choose between two table parsing methods, *Stream* and *Lattice*. The naming for parsing methods inside Camelot (i.e. Stream and Lattice) was inspired from `Tabula`_. + +.. 
_Tabula: https://github.com/tabulapdf/tabula + +.. _stream: + +Stream +------ + +Stream can be used to parse tables that have whitespaces between cells to simulate a table structure. It looks for these spaces between text to form a table representation. + +It is built on top of PDFMiner's functionality of grouping characters on a page into words and sentences, using `margins`_. After getting the words given on a page, it groups them into rows based on their *y* coordinates and tries to guess the number of columns the table might have by calculating the mode of the number of words in each row. This mode is used to calculate *x* ranges for the table's columns. It then adds columns to this column range list based on any words that may lie outside or inside the current column *x* ranges. + +.. _margins: https://euske.github.io/pdfminer/#tools + +.. note:: By default, Stream treats the whole PDF page as a table. Automatic table detection for Stream is `in the works`_. + +.. _in the works: https://github.com/socialcopsdev/camelot/issues/102 + +.. _lattice: + +Lattice +------- + +Lattice is more deterministic in nature, and does not rely on guesses. It can be used to parse tables that have demarcated lines between cells. + +It starts by converting the PDF page to an image using ghostscript and then processing it to get horizontal and vertical line segments by applying a set of morphological transformations (erosion and dilation) using OpenCV. + +Let's see how Lattice processes the `second page of this PDF`_, step-by-step. + +.. _second page of this PDF: https://github.com/socialcopsdev/camelot/blob/docs/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-030.pdf + +1. Line segments are detected. + +.. image:: ../_static/png/geometry_line.png + :height: 674 + :width: 1366 + :scale: 50% + :align: left + +2. Line intersections are detected, by overlapping the detected line segments and "`and`_"ing their pixel intensities. + +.. 
_and: https://en.wikipedia.org/wiki/Logical_conjunction + +.. image:: ../_static/png/geometry_joint.png + :height: 674 + :width: 1366 + :scale: 50% + :align: left + +3. Table boundaries are computed, by overlapping the detected line segments again, this time by "`or`_"ing their pixel intensities. + +.. _or: https://en.wikipedia.org/wiki/Logical_disjunction + +.. image:: ../_static/png/geometry_contour.png + :height: 674 + :width: 1366 + :scale: 50% + :align: left + +4. Since dimensions of the PDF page and its image vary; the detected table boundaries, line intersections and line segments are scaled and translated to the PDF page's coordinate space, and a representation of the table is created. + +.. image:: ../_static/png/table.png + :height: 674 + :width: 1366 + :scale: 50% + :align: left + +5. Spanning cells are detected using the line segments and line intersections. + +.. image:: ../_static/png/geometry_table.png + :height: 674 + :width: 1366 + :scale: 50% + :align: left + +6. Finally, the words found on the page are assigned to the table's cells based on their *x* and *y* coordinates. \ No newline at end of file diff --git a/docs/user/install.rst b/docs/user/install.rst new file mode 100644 index 0000000..7552afc --- /dev/null +++ b/docs/user/install.rst @@ -0,0 +1,38 @@ +.. _install: + +Installation of Camelot +======================= + +This part of the documentation covers the installation of Camelot. First, you'll need to install the dependencies, which include `tk`_ and `ghostscript`_. + +.. _tk: https://wiki.tcl.tk/3743 +.. _ghostscript: https://www.ghostscript.com/ + +These can be installed using your system's package manager. 
If you use Ubuntu, run the following: +:: + + $ sudo apt install python-tk ghostscript + +$ pip install camelot-py +------------------------ + +After installing the dependencies, you can simply use pip to install Camelot: +:: + + $ pip install camelot-py + +Get the Source Code +------------------- + +Alternatively, you can install from source by: + +1. Cloning the GitHub repository. +:: + + $ git clone https://www.github.com/socialcopsdev/camelot + +2. And then simply using pip again. +:: + + $ cd camelot + $ pip install . \ No newline at end of file diff --git a/docs/user/intro.rst b/docs/user/intro.rst new file mode 100644 index 0000000..902b1a2 --- /dev/null +++ b/docs/user/intro.rst @@ -0,0 +1,42 @@ +.. _intro: + +Introduction +============ + +The Camelot Project +------------------- + +The Portable Document Format (PDF) was born out of `The Camelot Project`_ when a need was felt for "a universal way to communicate documents across a wide variety of machine configurations, operating systems and communication networks". The goal was to make these documents viewable on any display and printable on any modern printer. The invention of the `PostScript`_ page description language, which enabled the creation of fixed-layout flat documents (with text, fonts, graphics, images encapsulated), solved the problem. + +At a very high level, PostScript defines instructions, such as, "place this character at this x,y coordinate on a plane". Spaces can be *simulated* by placing characters relatively far apart. Similarly, tables can be *simulated* by placing characters (and words) in two-dimensional grids. A PDF viewer just takes these instructions and draws everything for the user to view. Since it's just characters on a plane, there is no table data structure which can be directly extracted and used for analysis! + +Sadly, a lot of open data is given out as tables which are trapped inside PDF files. + +.. 
_PostScript: http://www.planetpdf.com/planetpdf/pdfs/warnock_camelot.pdf + +Why another PDF Table Parsing library? +-------------------------------------- + +There are both open-source (`Tabula`_) and closed-source (`PDFTables`_, `Smallpdf`_) tools that are used widely to extract tables from PDF files. They either give nice output, or fail miserably. There is no in-between. This does not help most users, since everything in the real world, including PDF table extraction, is fuzzy. This leads to the creation of ad hoc table extraction scripts for each different type of PDF that the user wants to parse. + +Camelot was created with the goal of offering its users complete control over table extraction. If the users are not able to get the desired output with the default configuration, they should be able to tweak the parameters and get the tables out! + +Here is a `comparison`_ of Camelot's output with outputs from other PDF parsing libraries and tools. + +.. _Tabula: http://tabula.technology/ +.. _PDFTables: https://pdftables.com/ +.. _Smallpdf: https://smallpdf.com +.. _comparison: https://github.com/socialcopsdev/camelot/wiki/Comparison-with-other-PDF-Table-Parsing-libraries-and-tools + +What's in a name? +----------------- + +As you can already guess, this library is named after `The Camelot Project`_. The image on the left is taken from `Monty Python and the Holy Grail`_. In the movie, it is the castle "Camelot" where Arthur leads his men, the Knights of the Round Table, and then sets off elsewhere after deciding that it is "a silly place". Interestingly, the language in which this library is written was named after Monty Python. + +.. _The Camelot Project: http://www.planetpdf.com/planetpdf/pdfs/warnock_camelot.pdf +.. _Monty Python and the Holy Grail: https://en.wikipedia.org/wiki/Monty_Python_and_the_Holy_Grail + +Camelot License +--------------- + + .. 
include:: ../../LICENSE \ No newline at end of file diff --git a/docs/user/quickstart.rst b/docs/user/quickstart.rst new file mode 100644 index 0000000..c94a28c --- /dev/null +++ b/docs/user/quickstart.rst @@ -0,0 +1,92 @@ +.. _quickstart: + +Quickstart +========== + +In a hurry to extract tables from PDFs? This document gives a good introduction to help you get started with using Camelot. + +Parse a PDF +----------- + +Parsing a PDF to extract tables with Camelot is very simple. + +Begin by importing the Camelot module:: + + >>> import camelot + +Now, let's try to read a PDF. You can check out the PDF used in this example, `here`_. Since the PDF has a table with clearly demarcated lines, we will use the :ref:`Lattice ` method here. To do that we will set the ``mesh`` keyword argument to ``True``. + +.. note:: :ref:`Stream ` is used by default. + +.. _here: _static/pdf/foo.pdf + +:: + + >>> tables = camelot.read_pdf('foo.pdf', mesh=True) + >>> tables + + +Now, we have a :class:`TableList ` object called ``tables``, which is a list of :class:`Table ` objects. We can get everything we need from this object. + +We can access each table using its index. We can see that the ``tables`` object has only one table, since ``n=1``. Let's access the table using the index ``0`` and take a look at its ``shape``. + +:: + + >>> tables[0] +
+ +Let's print the parsing report. + +:: + + >>> print(tables[0].parsing_report) + { + 'accuracy': 99.02, + 'whitespace': 12.24, + 'order': 1, + 'page': 1 + } + +Woah! The accuracy is top-notch and whitespace is low, which means the table was parsed correctly (most probably). You can access the table as a pandas DataFrame by using its ``df`` attribute. + +:: + + >>> tables[0].df + +.. csv-table:: + :file: ../_static/csv/foo.csv + +Looks good! You can export the table as a CSV file using its :meth:`to_csv() ` method. Alternatively, you can use the :meth:`to_json() `, :meth:`to_excel() ` or :meth:`to_html() ` methods to export the table as JSON, Excel and HTML files respectively. + +:: + + >>> tables[0].to_csv('foo.csv') + +This will export the table as a CSV file at the path specified. In this case, it is ``foo.csv`` in the current directory. + +You can also export all tables at once, using the ``tables`` object's :meth:`export() ` method. + +:: + + >>> tables.export('foo.csv', f='csv') + +This will export all tables as CSV files at the path specified. Alternatively, you can use ``f='json'``, ``f='excel'`` or ``f='html'``. + +.. note:: The :meth:`export() ` method exports files with a ``page-*-table-*`` suffix. In the example above, the single table in the list will be exported to ``foo-page-1-table-1.csv``. If the list contains multiple tables, multiple files will be created. To avoid filling up your path with multiple files, you can use ``compress=True`` to add all exported files to a ZIP archive. + +.. note:: Camelot handles rotated PDF pages automatically. As an exercise, try to extract the table out of `this PDF file`_. + +.. _this PDF file: ../_static/pdf/rotated.pdf + +Specify page numbers +-------------------- + +By default, Camelot only parses the first page of the PDF. 
To specify multiple pages, you can use the ``pages`` keyword argument:: + + >>> camelot.read_pdf('your.pdf', pages='1,2,3') + +The ``pages`` keyword argument accepts pages as a comma-separated string of page numbers. You can also specify page ranges, for example ``pages='1,4-10,20-30'`` or ``pages='1,4-10,20-end'``. + +------------------------ + +Ready for more? Check out the :ref:`advanced ` section. \ No newline at end of file diff --git a/requirements-dev.txt b/requirements-dev.txt index d907a0b..388f751 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,11 +1,3 @@ -click==6.7 -matplotlib==2.2.3 -numpy==1.13.3 -opencv-python==3.4.2.17 -pandas==0.23.4 -pdfminer==20140328 -Pillow==5.2.0 -PyPDF2==1.26.0 pytest==3.8.0 pytest-runner==4.2 -Sphinx==1.8.0b1 \ No newline at end of file +Sphinx==1.7.9 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index d1a33b7..a9b0931 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,5 +4,4 @@ numpy==1.13.3 opencv-python==3.4.2.17 pandas==0.23.4 pdfminer==20140328 -Pillow==5.2.0 PyPDF2==1.26.0 \ No newline at end of file diff --git a/setup.py b/setup.py index 20f794f..d37bcf2 100644 --- a/setup.py +++ b/setup.py @@ -1,37 +1,23 @@ +import os +from setuptools import find_packages from pkg_resources import parse_version -import camelot +here = os.path.abspath(os.path.dirname(__file__)) +about = {} +with open(os.path.join(here, 'camelot', '__version__.py'), 'r') as f: + exec(f.read(), about) + +# TODO: Move these to __version__.py NAME = 'camelot' -VERSION = camelot.__version__ +VERSION = about['__version__'] DESCRIPTION = 'PDF Table Parsing for Humans' with open('README.md') as f: LONG_DESCRIPTION = f.read() URL = 'https://github.com/socialcopsdev/camelot' AUTHOR = 'Vinayak Mehta' AUTHOR_EMAIL = 'vmehta94@gmail.com' -LICENSE = 'BSD License' - -opencv_min_version = '2.4.8' - - -def get_opencv_status(): - """ - Returns a dictionary containing a boolean specifying whether OpenCV - is 
up-to-date, along with the version string (empty string if - not installed). - """ - opencv_status = {} - try: - import cv2 - opencv_version = cv2.__version__ - opencv_status['up_to_date'] = parse_version( - opencv_version) >= parse_version(opencv_min_version) - opencv_status['version'] = opencv_version - except ImportError: - opencv_status['up_to_date'] = False - opencv_status['version'] = "" - return opencv_status +LICENSE = 'MIT License' def setup_package(): @@ -40,6 +26,11 @@ def setup_package(): for line in f: reqs.append(line.strip()) + dev_reqs = [] + with open('requirements-dev.txt', 'r') as f: + for line in f: + dev_reqs.append(line.strip()) + metadata = dict(name=NAME, version=VERSION, description=DESCRIPTION, @@ -48,32 +39,30 @@ def setup_package(): author=AUTHOR, author_email=AUTHOR_EMAIL, license=LICENSE, - packages=['camelot'], + packages=find_packages(exclude=('tests',)), install_requires=reqs, + extras_require={ + 'dev': dev_reqs + }, entry_points={ 'console_scripts': [ 'camelot = camelot.cli:cli', ], - }) + }, + classifiers=[ + # Trove classifiers + # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers + 'License :: OSI Approved :: MIT License', + 'Programming Language :: Python :: 2.7' + ]) try: from setuptools import setup except: from distutils.core import setup - opencv_status = get_opencv_status() - opencv_req_str = "camelot requires OpenCV >= {0}.\n".format(opencv_min_version) - - if opencv_status['up_to_date'] is False: - if opencv_status['version']: - raise ImportError("Your installation of OpenCV {} is out-of-date.\n{}" - .format(opencv_status['version'], opencv_req_str)) - else: - raise ImportError("OpenCV is not installed.\n{}" - .format(opencv_req_str)) - setup(**metadata) if __name__ == '__main__': - setup_package() + setup_package() \ No newline at end of file