diff --git a/README.md b/README.md index fa6d6a1..6ceee80 100644 --- a/README.md +++ b/README.md @@ -1,28 +1,19 @@ # camelot -## Dependencies - -Currently, camelot works under Python 2.7. - -The required dependencies include numpy, opencv, and imagemagick. - -## Install - -Make sure you have the required dependencies installed on your system. If you're working in a virtual environment, copy the `cv2.so` file from your system's site-packages to the virtualenv's site-packages. After that, `cd` into the project directory and issue the following command. - -
-python setup.py install
-
+Camelot is a Python 2.7 library and command-line tool for getting tables out of PDF files. ## Usage
-from camelot import *
+from camelot.pdf import Pdf
+from camelot.lattice import Lattice
 
 extractor = Lattice(Pdf("/path/to/pdf", pagenos=[{'start': 2, 'end': 4}]))
 tables = extractor.get_tables()
 
+Camelot comes with a command-line tool in which you can specify the output format (csv, tsv, html, json, and xlsx), page numbers you want to parse and the output directory in which you want the output files to be placed. By default, the output files are placed in the same directory as the PDF. +
 camelot parses tables from PDFs!
 
@@ -45,6 +36,92 @@ camelot methods:
 See 'camelot <method> -h' for more information on a specific method.
 
+## Dependencies + +Currently, camelot works under Python 2.7. + +The required dependencies include [numpy](http://www.numpy.org/), [OpenCV](http://opencv.org/) and [ImageMagick](http://www.imagemagick.org/script/index.php). + +## Installation + +Make sure you have the latest versions of `pip` and `setuptools`. You can update them with + +
+pip install -U pip setuptools
+
+ +We strongly recommend that you use a [virtual environment](http://virtualenvwrapper.readthedocs.io/en/latest/install.html#basic-installation) to install Camelot. If you don't want to use a virtual environment, then skip the next section. + +### Installing virtualenvwrapper + +You'll need to install [virtualenvwrapper](https://virtualenvwrapper.readthedocs.io/en/latest/). + +
+pip install virtualenvwrapper
+
+ +or +
+sudo pip install virtualenvwrapper
+
+ +After installing virtualenvwrapper, add the following lines to your `.bashrc` and source it. + +
+export WORKON_HOME=$HOME/.virtualenvs
+source /usr/bin/virtualenvwrapper.sh
+
+ +The path to `virtualenvwrapper.sh` could be different on your system. + +Finally make a virtual environment using + +
+mkvirtualenv camelot
+
+ +### Installing dependencies + +numpy can be installed using pip. + +
+pip install numpy
+
+ +OpenCV and imagemagick can be installed using your system's default package manager. + +#### Linux + +* Arch Linux + +
+sudo pacman -S opencv imagemagick
+
+ +* Ubuntu + +
+sudo apt-get install libopencv-dev python-opencv imagemagick
+
+ +#### OS X + +
+brew install homebrew/science/opencv imagemagick
+
+ +If you're working in a virtualenv, you'll need to create a symbolic link for the OpenCV shared object file + +
+sudo ln -s /path/to/system/site-packages/cv2.so ~/path/to/virtualenv/site-packages/cv2.so
+
+ +Finally, `cd` into the project directory and install by doing + +
+make install
+
+ ## Development ### Code @@ -57,28 +134,14 @@ git clone https://github.com/socialcopsdev/camelot.git ### Contributing -The preferred way to contribute to camelot is to fork this repository, and then submit a "pull request" (PR): - -1. Create an account on GitHub if you don't already have one. -2. Fork the project repository: click on the ‘Fork’ button near the top of the page. This creates a copy of the code under your account on the GitHub server. -3. Clone this copy to your local disk. -4. Create a branch to hold your changes: -
-git checkout -b my-feature
-
-and start making changes. Never work in the `master` branch! -5. Work on this copy, on your computer, using Git to do the version control. When you’re done editing, do: -
-$ git add modified_files
-$ git commit
-
-to record your changes in Git, then push them to GitHub with: -
-$ git push -u origin my-feature
-
- -Finally, go to the web page of the your fork of the camelot repo, and click ‘Pull request’ to send your changes to the maintainers for review. +See [Contributing doc](docs/contributing.rst). ### Testing +
+make test
+
+ ## License + +BSD License \ No newline at end of file diff --git a/camelot/lattice.py b/camelot/lattice.py index 9f0b419..041b296 100644 --- a/camelot/lattice.py +++ b/camelot/lattice.py @@ -129,9 +129,9 @@ class Lattice: ---------- pdfobject : camelot.pdf.Pdf - fill : None, 'h', 'v', 'hv' + fill : string Fill data in horizontal and/or vertical spanning - cells. (optional, default: None) + cells. (optional, default: None) {None, 'h', 'v', 'hv'} scale : int Scaling factor. Large scaling factor leads to smaller lines @@ -149,9 +149,9 @@ class Lattice: Invert pdf image to make sure that lines are in foreground. (optional, default: False) - debug : 'contour', 'line', 'joint', 'table' + debug : string Debug by visualizing pdf geometry. - (optional, default: None) + (optional, default: None) {'contour', 'line', 'joint', 'table'} Attributes ---------- diff --git a/docs/api.rst b/docs/api.rst new file mode 100644 index 0000000..99a9e7f --- /dev/null +++ b/docs/api.rst @@ -0,0 +1,20 @@ +.. _api: + +============= +API Reference +============= + +Pdf +=== +.. automodule:: camelot.pdf + :members: + +Lattice +======= +.. automodule:: camelot.lattice + :members: + +Stream +====== +.. 
automodule:: camelot.stream + :members: \ No newline at end of file diff --git a/docs/assets/columns.png b/docs/assets/columns.png new file mode 100644 index 0000000..760d110 Binary files /dev/null and b/docs/assets/columns.png differ diff --git a/docs/assets/contour.png b/docs/assets/contour.png new file mode 100644 index 0000000..853c179 Binary files /dev/null and b/docs/assets/contour.png differ diff --git a/docs/assets/intersection.png b/docs/assets/intersection.png new file mode 100644 index 0000000..80481ad Binary files /dev/null and b/docs/assets/intersection.png differ diff --git a/docs/assets/line.png b/docs/assets/line.png new file mode 100644 index 0000000..e5fba60 Binary files /dev/null and b/docs/assets/line.png differ diff --git a/docs/assets/scale_1.png b/docs/assets/scale_1.png new file mode 100644 index 0000000..e9023e0 Binary files /dev/null and b/docs/assets/scale_1.png differ diff --git a/docs/assets/scale_2.png b/docs/assets/scale_2.png new file mode 100644 index 0000000..798fd2a Binary files /dev/null and b/docs/assets/scale_2.png differ diff --git a/docs/assets/table.png b/docs/assets/table.png new file mode 100644 index 0000000..0d3dced Binary files /dev/null and b/docs/assets/table.png differ diff --git a/docs/assets/table_span.png b/docs/assets/table_span.png new file mode 100644 index 0000000..06684d8 Binary files /dev/null and b/docs/assets/table_span.png differ diff --git a/docs/conf.py b/docs/conf.py index 1dcf949..16c611b 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -15,7 +15,6 @@ # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. 
-# import os import sys sys.path.insert(0, os.path.abspath('..')) @@ -32,12 +31,9 @@ sys.path.insert(0, os.path.abspath('..')) extensions = [ 'sphinx.ext.autodoc', 'sphinx.ext.napoleon', - 'sphinx.ext.doctest', 'sphinx.ext.intersphinx', 'sphinx.ext.todo', - 'sphinx.ext.coverage', 'sphinx.ext.viewcode', - 'sphinx.ext.githubpages', ] # Add any paths that contain templates here, relative to this directory. @@ -134,8 +130,13 @@ html_theme = 'alabaster' # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. -# -# html_theme_options = {} +html_theme_options = { + 'show_powered_by': False, + 'github_user': 'socialcopsdev', + 'github_repo': 'camelot', + 'github_banner': True, + 'show_related': False +} # Add any paths that contain custom themes here, relative to this directory. # html_theme_path = [] diff --git a/docs/contributing.rst b/docs/contributing.rst new file mode 100644 index 0000000..928dd33 --- /dev/null +++ b/docs/contributing.rst @@ -0,0 +1,25 @@ +.. _contributing: + +The preferred way to contribute to Camelot is to fork this repository, and then submit a "pull request" (PR): + +1. Create an account on GitHub if you don't already have one. + +2. Fork the project repository: click on the ‘Fork’ button near the top of the page. This creates a copy of the code under your account on the GitHub server. + +3. Clone this copy to your local disk. +4. Create a branch to hold your changes:: + + git checkout -b my-feature + + and start making changes. Never work in the `master` branch! + +5. Work on this copy, on your computer, using Git to do the version control. 
When you’re done editing, do:: + + $ git add modified_files + $ git commit + + to record your changes in Git, then push them to GitHub with:: + + $ git push -u origin my-feature + +Finally, go to the web page of your fork of the camelot repo, and click ‘Pull request’ to send your changes to the maintainers for review. diff --git a/docs/index.rst b/docs/index.rst index 633973d..d22bba9 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -3,20 +3,193 @@ You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. -camelot: Parse tables from PDFs! -================================ +================================== +Camelot: PDF parsing made simpler! +================================== -Contents: +Camelot is a Python 2.7 library and command-line tool for getting tables out of PDF files. -.. toctree:: - :maxdepth: 2 +Why another PDF table parsing library? +====================================== +We tried a lot of tools available online to get tables out of PDFs, but each one had its limitations. `PDFTables`_ stopped its open source development in 2013. `SolidConverter`_ which powers `Smallpdf`_ is closed source. Recently, `Docparser`_ was launched, which again is closed source. `Tabula`_, though being open source, doesn't always give correct output. In most cases, we had to resort to writing custom scripts for each type of PDF. +.. _PDFTables: https://pdftables.com/ +.. _SolidConverter: http://www.soliddocuments.com/pdf/-to-word-converter/304/1 +.. _Smallpdf: https://smallpdf.com/ +.. _Docparser: https://docparser.com/ +.. _Tabula: http://tabula.technology/ -Indices and tables -================== +PDFs have feelings too +====================== -* :ref:`genindex` -* :ref:`modindex` -* :ref:`search` +PDF started as `The Camelot Project`_ when people wanted a cross-platform way to share documents, since a document looked different on each system. A PDF contains characters placed at specific x,y-coordinates. 
Spaces are simulated by placing characters relatively far apart. +Camelot uses two methods to parse tables from PDFs, :doc:`lattice <lattice>` and :doc:`stream <stream>`. The names were taken from Tabula but the implementation is somewhat different, though it follows the same philosophy. Lattice looks for lines between text elements while stream looks for whitespace between text elements. + +.. _The Camelot Project: http://www.planetpdf.com/planetpdf/pdfs/warnock_camelot.pdf + +Usage +===== + +:: + + >>> from camelot.pdf import Pdf + >>> from camelot.lattice import Lattice + + >>> extractor = Lattice(Pdf('us-030.pdf')) + >>> tables = extractor.get_tables() + >>> print tables['pg-1'] + +.. csv-table:: + :header: "Cycle Name","KI (1/km)","Distance (mi)","Percent Fuel Savings","","","" + + "","","","Improved Speed","Decreased Accel","Eliminate Stops","Decreased Idle" + "2012_2","3.30","1.3","5.9%","9.5%","29.2%","17.4%" + "2145_1","0.68","11.2","2.4%","0.1%","9.5%","2.7%" + "4234_1","0.59","58.7","8.5%","1.3%","8.5%","3.3%" + "2032_2","0.17","57.8","21.7%","0.3%","2.7%","1.2%" + "4171_1","0.07","173.9","58.1%","1.6%","2.1%","0.5%" + +Camelot comes with a command-line tool in which you can specify the output format (csv, tsv, html, json, and xlsx), page numbers you want to parse and the output directory in which you want the output files to be placed. By default, the output files are placed in the same directory as the PDF. + +:: + + Camelot: PDF parsing made simpler! + + usage: + camelot [options] <method> [<args>...] + + options: + -h, --help Show this screen. + -v, --version Show version. + -p, --pages Comma-separated list of page numbers. + Example: -p 1,3-6,10 [default: 1] + -f, --format Output format. (csv,tsv,html,json,xlsx) [default: csv] + -l, --log Print log to file. + -o, --output Output directory. + + camelot methods: + lattice Looks for lines between data. + stream Looks for spaces between data. + + See 'camelot <method> -h' for more information on a specific method. 
+ +Installation +============ + +Make sure you have the latest versions of `pip` and `setuptools`. You can update them with:: + + pip install -U pip setuptools + +The required dependencies include `numpy`_, `OpenCV`_ and `ImageMagick`_. + +.. _numpy: http://www.numpy.org/ +.. _OpenCV: http://opencv.org/ +.. _ImageMagick: http://www.imagemagick.org/script/index.php + +We strongly recommend that you use a `virtual environment`_ to install Camelot. If you don't want to use a virtual environment, then skip the next section. + +Installing virtualenvwrapper +---------------------------- + +You'll need to install `virtualenvwrapper`_. + +:: + + pip install virtualenvwrapper + +or + +:: + + sudo pip install virtualenvwrapper + +After installing virtualenvwrapper, add the following lines to your `.bashrc` and source it. + +:: + + export WORKON_HOME=$HOME/.virtualenvs + source /usr/bin/virtualenvwrapper.sh + +.. note:: The path to `virtualenvwrapper.sh` could be different on your system. + +Finally make a virtual environment using:: + + mkvirtualenv camelot + +Installing dependencies +----------------------- + +`numpy` can be installed using `pip`. + +:: + + pip install numpy + +`OpenCV` and `imagemagick` can be installed using your system's default package manager. + +Linux +^^^^^ + +* Arch Linux + +:: + + sudo pacman -S opencv imagemagick + +* Ubuntu + +:: + + sudo apt-get install libopencv-dev python-opencv imagemagick + +OS X +^^^^ + +:: + + brew install homebrew/science/opencv imagemagick + +If you're working in a virtualenv, you'll need to create a symbolic link for the OpenCV shared object file:: + + sudo ln -s /path/to/system/site-packages/cv2.so ~/path/to/virtualenv/site-packages/cv2.so + +Finally, `cd` into the project directory and install by doing:: + + make install + +.. _virtual environment: http://virtualenvwrapper.readthedocs.io/en/latest/install.html#basic-installation +.. 
_virtualenvwrapper: https://virtualenvwrapper.readthedocs.io/en/latest/ + +API Reference +============= + +See :doc:`API doc <api>`. + +Development +=========== + +Code +---- + +You can check the latest sources with the command:: + + git clone https://github.com/socialcopsdev/camelot.git + +Contributing +------------ + +See :doc:`Contributing doc <contributing>`. + +Testing +------- + +:: + + make test + +License +======= + +BSD License \ No newline at end of file diff --git a/docs/lattice.rst b/docs/lattice.rst new file mode 100644 index 0000000..b4a0fda --- /dev/null +++ b/docs/lattice.rst @@ -0,0 +1,193 @@ +.. _lattice: + +======= +Lattice +======= + +Lattice method is designed to work on PDFs which have tables with well-defined grids. It looks for lines on a page to form a table representation. + +Lattice uses OpenCV to apply a set of morphological transformations (erosion and dilation) to find horizontal and vertical line segments in a PDF page after converting it to an image using imagemagick. + +.. note:: Currently, Lattice only works on PDFs that contain text i.e. they are not composed of an image of the text. However, we plan to add `OCR support`_ in the future. + +.. _OCR support: https://github.com/socialcopsdev/camelot/issues/14 + +Let's see how Lattice processes this PDF, step by step. + +Line segments are detected in the first step. + +.. .. _this: insert link for us-030.pdf + +.. image:: assets/line.png + :height: 674 + :width: 1366 + :scale: 50% + :align: left + +The detected line segments are overlapped by `and` ing their pixel intensities to find intersections. + +.. image:: assets/intersection.png + :height: 674 + :width: 1366 + :scale: 50% + :align: left + +The detected line segments are overlapped again, this time by `or` ing their pixel intensities and outermost contours are computed to identify potential table boundaries. This helps Lattice in detecting more than one table on a single page. + +.. 
image:: assets/contour.png + :height: 674 + :width: 1366 + :scale: 50% + :align: left + +Since the dimensions of a PDF and its image differ, table contours, intersections and segments are scaled and translated to the PDF's coordinate space. A representation of the table is then created using these scaled coordinates. + +.. image:: assets/table.png + :height: 674 + :width: 1366 + :scale: 50% + :align: left + +Spanning cells are then detected using the line segments and intersections. + +.. image:: assets/table_span.png + :height: 674 + :width: 1366 + :scale: 50% + :align: left + +Finally, the characters found on the page are assigned to cells based on their x,y coordinates. + +:: + + >>> from camelot.pdf import Pdf + >>> from camelot.lattice import Lattice + + >>> extractor = Lattice(Pdf('us-030.pdf')) + >>> tables = extractor.get_tables() + >>> print tables['pg-1'] + +.. csv-table:: + :header: "Cycle Name","KI (1/km)","Distance (mi)","Percent Fuel Savings","","","" + + "","","","Improved Speed","Decreased Accel","Eliminate Stops","Decreased Idle" + "2012_2","3.30","1.3","5.9%","9.5%","29.2%","17.4%" + "2145_1","0.68","11.2","2.4%","0.1%","9.5%","2.7%" + "4234_1","0.59","58.7","8.5%","1.3%","8.5%","3.3%" + "2032_2","0.17","57.8","21.7%","0.3%","2.7%","1.2%" + "4171_1","0.07","173.9","58.1%","1.6%","2.1%","0.5%" + +Scale +----- + +The scale parameter is used to determine the length of the structuring element used for morphological transformations. The lengths of the vertical and horizontal structuring elements are found by dividing the image's height and width respectively, by `scale`. Large `scale` will lead to a smaller structuring element, which means that smaller lines will be detected. The default value for scale is 15. + +Let's consider this PDF. + +.. .. _this: insert link for row_span_1.pdf + +.. image:: assets/scale_1.png + :height: 674 + :width: 1366 + :scale: 50% + :align: left + +Clearly, it couldn't detect those small lines in the lower left part. 
Therefore, we need to increase the value of scale. Let's try a value of 40. + +.. image:: assets/scale_2.png + :height: 674 + :width: 1366 + :scale: 50% + :align: left + +Voila! It detected the smaller lines. + +Fill +---- + +In the PDF used above, you can see that some cells spanned a lot of rows, `fill` just copies the same value to all rows/columns of a spanning cell. You can apply fill horizontally, vertically or both. Let us fill the output for the PDF we used above, vertically. + +:: + + >>> from camelot.pdf import Pdf + >>> from camelot.lattice import Lattice + + >>> extractor = Lattice(Pdf('row_span_1.pdf'), fill='v', scale=40) + >>> tables = extractor.get_tables() + >>> print tables['pg-1'] + +.. csv-table:: + :header: "Plan Type","County","Plan Name","Totals" + + "GMC","Sacramento","Anthem Blue Cross","164,380" + "GMC","Sacramento","Health Net","126,547" + "GMC","Sacramento","Kaiser Foundation","74,620" + "GMC","Sacramento","Molina Healthcare","59,989" + "GMC","San Diego","Care 1st Health Plan","71,831" + "GMC","San Diego","Community...","264,639" + "GMC","San Diego","Health Net","72,404" + "GMC","San Diego","Kaiser","50,415" + "GMC","San Diego","Molina Healthcare","206,430" + "GMC","Total GMC...","","1,091,255" + "COHS","Marin","Partnership Health...","36,006" + "COHS","Mendocino","Partnership Health...","37,243" + "COHS","Napa","Partnership Health...","28,398" + "COHS","Solano","Partnership Health...","113,220" + "COHS","Sonoma","Partnership Health...","112,271" + "COHS","Yolo","Partnership Health...","52,674" + "COHS","Del Norte","Partnership Health...","11,242" + "COHS","Humboldt","Partnership Health...","49,911" + "COHS","Lake","Partnership Health...","29,149" + "COHS","Lassen","Partnership Health...","7,360" + "COHS","Modoc","Partnership Health...","2,940" + "COHS","Shasta","Partnership Health...","61,763" + "COHS","Siskiyou","Partnership Health...","16,715" + "COHS","Trinity","Partnership Health...","4,542" + "COHS","Merced","Central 
California...","123,907" + "COHS","Monterey","Central California...","147,397" + "COHS","Santa Cruz","Central California...","69,458" + "COHS","Santa Barbara","CenCal","117,609" + "COHS","San Luis Obispo","CenCal","55,761" + "COHS","Orange","CalOptima","783,079" + "COHS","San Mateo","Health Plan...","113,202" + "COHS","Ventura","Gold Coast...","202,217" + "COHS","Total COHS...","","2,176,064" + "Subtotal for...","","","10,132,022" + "PCCM","Los Angeles","AIDS Healthcare...","828" + "PCCM","San Francisco","Family Mosaic","25" + "PCCM","Total PHP...","","853" + "All Models...","","","10,132,875" + "Source: Data...","","","" + +Invert +------ + +To find line segments, Lattice needs the lines of the PDF to be in foreground. So, if you encounter a PDF like this, just set invert to True. + +.. .. _this: insert link for lines_in_background_1.pdf + +:: + + >>> from camelot.pdf import Pdf + >>> from camelot.lattice import Lattice + + >>> extractor = Lattice(Pdf('lines_in_background_1.pdf'), invert=True) + >>> tables = extractor.get_tables() + >>> print tables['pg-1'] + +.. csv-table:: + :header: "State","Date","Halt stations","Halt days","Persons directly reached(in lakh)","Persons trained","Persons counseled","Persons testedfor HIV" + + "Delhi","1.12.2009","8","17","1.29","3,665","2,409","1,000" + "Rajasthan","2.12.2009 to 19.12.2009","","","","","","" + "Gujarat","20.12.2009 to 3.1.2010","6","13","6.03","3,810","2,317","1,453" + "Maharashtra","4.01.2010 to 1.2.2010","13","26","1.27","5,680","9,027","4,153" + "Karnataka","2.2.2010 to 22.2.2010","11","19","1.80","5,741","3,658","3,183" + "Kerala","23.2.2010 to 11.3.2010","9","17","1.42","3,559","2,173","855" + "Total","","47","92","11.81","22,455","19,584","10,644" + +Lattice can also parse PDFs with tables like these that are rotated clockwise/anti-clockwise by 90 degrees. + +.. .. 
_these: insert link for left_rotated_table.pdf + +You can call Lattice with `debug` set to one of 'line', 'intersection', 'contour' or 'table', and then call `plot_geometry()`, which will generate an image like the ones on this page, with the help of which you can modify various parameters. See :doc:`API doc <api>` for more information. diff --git a/docs/stream.rst b/docs/stream.rst new file mode 100644 index 0000000..31adde2 --- /dev/null +++ b/docs/stream.rst @@ -0,0 +1,249 @@ +.. _stream: + +====== +Stream +====== + +Stream method is the complete opposite of Lattice and works on PDFs which have text placed uniformly apart across rows to simulate a table. It looks for spaces between text to form a table representation. + +Stream builds on top of PDFMiner's functionality of grouping characters on a page into words and sentences. After getting these words, it groups them into rows based on their y-coordinates and tries to guess the number of columns a PDF table might have by calculating the mode of the number of words in each row. Additionally, the user can specify the number of columns or column x-coordinates. + +Let's run it on this PDF. + +:: + + >>> from camelot.pdf import Pdf + >>> from camelot.stream import Stream + + >>> extractor = Stream(Pdf('eu-027.pdf')) + >>> tables = extractor.get_tables() + >>> print tables['pg-1'] + +.. .. _this: insert link for eu-027.pdf + +.. csv-table:: + + "C","Appendix C:...","","","" + "","Table C1:...","","","" + "","This table...","","","" + "Variable","Mean","Std. 
Dev.","Min","Max" + "Age","50.8","15.9","21","90" + "Men","0.47","0.50","0","1" + "East","0.28","0.45","0","1" + "Rural","0.15","0.36","0","1" + "Married","0.57","0.50","0","1" + "Single","0.21","0.40","0","1" + "Divorced","0.13","0.33","0","1" + "Widowed","0.08","0.26","0","1" + "Separated","0.03","0.16","0","1" + "Partner","0.65","0.48","0","1" + "Employed","0.55","0.50","0","1" + "Fulltime","0.34","0.47","0","1" + "Parttime","0.20","0.40","0","1" + "Unemployed","0.08","0.28","0","1" + "Homemaker","0.19","0.40","0","1" + "Retired","0.28","0.45","0","1" + "Household size","2.43","1.22","1","9" + "Households...","0.37","0.48","0","1" + "Number of...","1.67","1.38","0","8" + "Lower...","0.08","0.27","0","1" + "Upper...","0.60","0.49","0","1" + "Post...","0.12","0.33","0","1" + "First...","0.17","0.38","0","1" + "Other...","0.03","0.17","0","1" + "Household...","2,127","1,389","22","22,500" + "Gross...","187,281","384,198","0","7,720,000" + "Gross...","38,855","114,128","0","2,870,000" + "","Source:...","","","" + "","","","","ECB" + "","","","","Working..." + "","","","","Febuary..." + +But sometimes its guess could be incorrect, like in this case. + +:: + + >>> from camelot.pdf import Pdf + >>> from camelot.stream import Stream + + >>> extractor = Stream(Pdf('missing_values.pdf')) + >>> tables = extractor.get_tables() + >>> print tables['pg-1'] + +.. .. _this: insert link for missing_values.pdf + +.. csv-table:: + + "Bhandara...","","" + "","DLHS-4...","DLHS-3..." 
+ "Indicators","TOTAL","RURAL TOTAL RURAL" + "Reported Prevalence of Morbidity","","" + "Any Injury...","1.9","2.1" + "Acute Illness...","4.5","5.6" + "Chronic Illness...","5.1","4.1" + "Reported Prevalence of Chronic Illness during last one year (%)","","" + "Disease of respiratory system...","11.7","15.0" + "Disease of cardiovascular system...","8.9","9.3" + "Persons suffering from tuberculosis...","2.2","1.5" + "Anaemia Status by Haemoglobin Level14 (%)","","" + "Children (6-59 months) having anaemia...","68.5","71.9" + "Children (6-59 months) having severe anaemia...","6.7","9.4" + "Children (6-9 Years) having anaemia - Male...","67.1","71.4" + "Children (6-9 Years) having severe anaemia - Male...","4.4","2.4" + "Children (6-9 Years) having anaemia - Female...","52.4","48.8" + "Children (6-9 Years) having severe anaemia - Female...","1.2","0.0" + "Children (6-14 years) having anaemia - Male...","50.8","62.5" + "Children (6-14 years) having severe anaemia - Male...","3.7","3.6" + "Children (6-14 years) having anaemia - Female...","48.3","50.0" + "Children (6-14 years) having severe anaemia - Female...","4.3","6.1" + "Children (10-19 Years15) having anaemia - Male...","37.9","51.2" + "Children (10-19 Years15) having severe anaemia - Male...","3.5","4.0" + "Children (10-19 Years15) having anaemia - Female...","46.6","52.1" + "Children (10-19 Years15) having severe anaemia - Female...","6.4","6.5" + "Adolescents (15-19 years) having anaemia...","39.4","46.5" + "Adolescents (15-19 years) having severe anaemia...","5.4","5.1" + "Pregnant women (15-49 aged) having anaemia...","48.8","51.5" + "Pregnant women (15-49 aged) having severe anaemia...","7.1","8.8" + "Women (15-49 aged) having anaemia...","45.2","51.7" + "Women (15-49 aged) having severe anaemia...","4.8","5.9" + "Persons (20 years and above) having anaemia...","37.8","42.1" + "Persons (20 years and above) having Severe anaemia...","4.6","4.8" + "Blood Sugar Level (age 18 years and above) (%)","","" + "Blood 
Sugar Level >140 mg/dl (high)...","12.9","11.1" + "Blood Sugar Level >160 mg/dl (very high)...","7.0","5.1" + "Hypertension (age 18 years and above) (%)","","" + "Above Normal Range (Systolic >140 mm of Hg & Diastolic >90 mm of Hg )...","23.8","22.8" + "Moderately High (Systolic >160 mm of Hg & Diastolic >100 mm of Hg )...","8.2","7.1" + "Very High (Systolic >180 mm of Hg & Diastolic >110 mm of Hg )...","3.7","3.1" + "14...","","" + "Chronic...","","" + +It guessed that the PDF has 3 columns, because there wasn't any data in the last 2 columns for most rows. So, let's specify the number of columns explicitly, following which, Stream will only consider rows that have 5 words, to decide on column boundaries. + +:: + + >>> from camelot.pdf import Pdf + >>> from camelot.stream import Stream + + >>> extractor = Stream(Pdf('missing_values.pdf'), ncolumns=5) + >>> tables = extractor.get_tables() + >>> print tables['pg-1'] + +.. csv-table:: + + "Bhandara...","","","","" + "","DLHS-4...","DLHS-3...","","" + "Indicators","TOTAL","RURAL","TOTAL","RURAL" + "Reported Prevalence of Morbidity","","","","" + "Any Injury...","1.9","2.1","","" + "Acute Illness...","4.5","5.6","","" + "Chronic Illness...","5.1","4.1","","" + "Reported Prevalence of Chronic Illness during last one year (%)","","","","" + "Disease of respiratory system...","11.7","15.0","","" + "Disease of cardiovascular system...","8.9","9.3","","" + "Persons suffering from tuberculosis...","2.2","1.5","","" + "Anaemia Status by Haemoglobin Level14 (%)","","","","" + "Children (6-59 months) having anaemia...","68.5","71.9","","" + "Children (6-59 months) having severe anaemia...","6.7","9.4","","" + "Children (6-9 Years) having anaemia - Male...","67.1","71.4","","" + "Children (6-9 Years) having severe anaemia - Male...","4.4","2.4","","" + "Children (6-9 Years) having anaemia - Female...","52.4","48.8","","" + "Children (6-9 Years) having severe anaemia - Female...","1.2","0.0","","" + "Children (6-14 years) having 
anaemia - Male...","50.8","62.5","","" + "Children (6-14 years) having severe anaemia - Male...","3.7","3.6","","" + "Children (6-14 years) having anaemia - Female...","48.3","50.0","","" + "Children (6-14 years) having severe anaemia - Female...","4.3","6.1","","" + "Children (10-19 Years15) having anaemia - Male...","37.9","51.2","","" + "Children (10-19 Years15) having severe anaemia - Male...","3.5","4.0","","" + "Children (10-19 Years15) having anaemia - Female...","46.6","52.1","","" + "Children (10-19 Years15) having severe anaemia - Female...","6.4","6.5","","" + "Adolescents (15-19 years) having anaemia...","39.4","46.5","","" + "Adolescents (15-19 years) having severe anaemia...","5.4","5.1","","" + "Pregnant women (15-49 aged) having anaemia...","48.8","51.5","","" + "Pregnant women (15-49 aged) having severe anaemia...","7.1","8.8","","" + "Women (15-49 aged) having anaemia...","45.2","51.7","","" + "Women (15-49 aged) having severe anaemia...","4.8","5.9","","" + "Persons (20 years and above) having anaemia...","37.8","42.1","","" + "Persons (20 years and above) having Severe anaemia...","4.6","4.8","","" + "Blood Sugar Level (age 18 years and above) (%)","","","","" + "Blood Sugar Level >140 mg/dl (high)...","12.9","11.1","","" + "Blood Sugar Level >160 mg/dl (very high)...","7.0","5.1","","" + "Hypertension (age 18 years and above) (%)","","","","" + "Above Normal Range (Systolic >140 mm of Hg & Diastolic >90 mm of Hg )...","23.8","22.8","","" + "Moderately High (Systolic >160 mm of Hg & Diastolic >100 mm of Hg )...","8.2","7.1","","" + "Very High (Systolic >180 mm of Hg & Diastolic >110 mm of Hg )...","3.7","3.1","","" + "14...","","","","" + "Chronic...","","","","" + +We can also specify the column x-coordinates. We need to call Stream with debug=True and use matplotlib's interface to note down the column x-coordinates we need. Let's try it on this PDF. 
+ +:: + + >>> from camelot.pdf import Pdf + >>> from camelot.stream import Stream + + >>> extractor = Stream(Pdf('mexican_towns.pdf'), debug=True) + >>> extractor.plot_text() + +.. image:: assets/columns.png + :height: 674 + :width: 1366 + :scale: 50% + :align: left + +After getting the x-coordinates, we just need to pass them to Stream, like this. + +:: + + >>> from camelot.pdf import Pdf + >>> from camelot.stream import Stream + + >>> extractor = Stream(Pdf('mexican_towns.pdf'), columns='28,67,180,230,425,475,700') + >>> tables = extractor.get_tables() + >>> print tables['pg-1'] + +.. csv-table:: + + "Clave","","Clave","","Clave","" + "","Nombre Entidad","","Nombre Municipio","","Nombre Localidad" + "Entidad","","Municipio","","Localidad","" + "01","Aguascalientes","001","Aguascalientes","0094","Granja Adelita" + "01","Aguascalientes","001","Aguascalientes","0096","Agua Azul" + "01","Aguascalientes","001","Aguascalientes","0100","Rancho Alegre" + "01","Aguascalientes","001","Aguascalientes","0102","Los Arbolitos [Rancho]" + "01","Aguascalientes","001","Aguascalientes","0104","Ardillas de Abajo (Las Ardillas)" + "01","Aguascalientes","001","Aguascalientes","0106","Arellano" + "01","Aguascalientes","001","Aguascalientes","0112","Bajío los Vázquez" + "01","Aguascalientes","001","Aguascalientes","0113","Bajío de Montoro" + "01","Aguascalientes","001","Aguascalientes","0114","Residencial San Nicolás [Baños la Cantera]" + "01","Aguascalientes","001","Aguascalientes","0120","Buenavista de Peñuelas" + "01","Aguascalientes","001","Aguascalientes","0121","Cabecita 3 Marías (Rancho Nuevo)" + "01","Aguascalientes","001","Aguascalientes","0125","Cañada Grande de Cotorina" + "01","Aguascalientes","001","Aguascalientes","0126","Cañada Honda [Estación]" + "01","Aguascalientes","001","Aguascalientes","0127","Los Caños" + "01","Aguascalientes","001","Aguascalientes","0128","El Cariñán" + "01","Aguascalientes","001","Aguascalientes","0129","El Carmen [Granja]" + 
"01","Aguascalientes","001","Aguascalientes","0135","El Cedazo (Cedazo de San Antonio)" + "01","Aguascalientes","001","Aguascalientes","0138","Centro de Arriba (El Taray)" + "01","Aguascalientes","001","Aguascalientes","0139","Cieneguilla (La Lumbrera)" + "01","Aguascalientes","001","Aguascalientes","0141","Cobos" + "01","Aguascalientes","001","Aguascalientes","0144","El Colorado (El Soyatal)" + "01","Aguascalientes","001","Aguascalientes","0146","El Conejal" + "01","Aguascalientes","001","Aguascalientes","0157","Cotorina de Abajo" + "01","Aguascalientes","001","Aguascalientes","0162","Coyotes" + "01","Aguascalientes","001","Aguascalientes","0166","La Huerta (La Cruz)" + "01","Aguascalientes","001","Aguascalientes","0170","Cuauhtémoc (Las Palomas)" + "01","Aguascalientes","001","Aguascalientes","0171","Los Cuervos (Los Ojos de Agua)" + "01","Aguascalientes","001","Aguascalientes","0172","San José [Granja]" + "01","Aguascalientes","001","Aguascalientes","0176","La Chiripa" + "01","Aguascalientes","001","Aguascalientes","0182","Dolores" + "01","Aguascalientes","001","Aguascalientes","0183","Los Dolores" + "01","Aguascalientes","001","Aguascalientes","0190","El Duraznillo" + "01","Aguascalientes","001","Aguascalientes","0191","Los Durón" + "01","Aguascalientes","001","Aguascalientes","0197","La Escondida" + "01","Aguascalientes","001","Aguascalientes","0201","Brande Vin [Bodegas]" + "01","Aguascalientes","001","Aguascalientes","0207","Valle Redondo" + "01","Aguascalientes","001","Aguascalientes","0209","La Fortuna" + "01","Aguascalientes","001","Aguascalientes","0212","Lomas del Gachupín" + "01","Aguascalientes","001","Aguascalientes","0213","El Carmen (Gallinas Güeras) [Rancho]" + "01","Aguascalientes","001","Aguascalientes","0216","La Gloria" diff --git a/examples/demo_lattice.py b/examples/demo_lattice.py new file mode 100644 index 0000000..b3ff2ea --- /dev/null +++ b/examples/demo_lattice.py @@ -0,0 +1,11 @@ +from camelot import Pdf +from camelot import Lattice + 
+ +extractor = Lattice(Pdf("files/column_span_1.pdf", clean=True), scale=30) +tables = extractor.get_tables() +print tables + +extractor = Lattice(Pdf("files/column_span_2.pdf", clean=True), scale=30) +tables = extractor.get_tables() +print tables diff --git a/examples/demo_lattice_fill.py b/examples/demo_lattice_fill.py new file mode 100644 index 0000000..3546b00 --- /dev/null +++ b/examples/demo_lattice_fill.py @@ -0,0 +1,13 @@ +from camelot import Pdf +from camelot import Lattice + + +extractor = Lattice( + Pdf("files/row_span_1.pdf", clean=True), fill='v', scale=40) +tables = extractor.get_tables() +print tables + +extractor = Lattice( + Pdf("files/row_span_2.pdf", clean=True), fill='v', scale=30) +tables = extractor.get_tables() +print tables diff --git a/examples/demo_lattice_invert.py b/examples/demo_lattice_invert.py new file mode 100644 index 0000000..a0bf41e --- /dev/null +++ b/examples/demo_lattice_invert.py @@ -0,0 +1,13 @@ +from camelot import Pdf +from camelot import Lattice + + +extractor = Lattice(Pdf("files/lines_in_background_1.pdf", + clean=True), scale=30, invert=True) +tables = extractor.get_tables() +print tables + +extractor = Lattice(Pdf("files/lines_in_background_2.pdf", + clean=True), scale=30, invert=True) +tables = extractor.get_tables() +print tables diff --git a/examples/demo_lattice_rotation.py b/examples/demo_lattice_rotation.py new file mode 100644 index 0000000..d201cf1 --- /dev/null +++ b/examples/demo_lattice_rotation.py @@ -0,0 +1,11 @@ +from camelot import Pdf +from camelot import Lattice + + +extractor = Lattice(Pdf("files/left_rotated_table.pdf", clean=True), scale=30) +tables = extractor.get_tables() +print tables + +extractor = Lattice(Pdf("files/right_rotated_table.pdf", clean=True), scale=30) +tables = extractor.get_tables() +print tables diff --git a/examples/demo_lattice_twotables.py b/examples/demo_lattice_twotables.py new file mode 100644 index 0000000..91c6b93 --- /dev/null +++ b/examples/demo_lattice_twotables.py @@ 
-0,0 +1,11 @@ +from camelot import Pdf +from camelot import Lattice + + +extractor = Lattice(Pdf("files/twotables_1.pdf", clean=True), scale=40) +tables = extractor.get_tables() +print tables + +extractor = Lattice(Pdf("files/twotables_2.pdf", clean=True), scale=30) +tables = extractor.get_tables() +print tables diff --git a/examples/demo_stream.py b/examples/demo_stream.py new file mode 100644 index 0000000..baee02f --- /dev/null +++ b/examples/demo_stream.py @@ -0,0 +1,8 @@ +from camelot import Pdf +from camelot import Stream + + +extractor = Stream(Pdf("files/budget_2014-15.pdf", + char_margin=1.0, clean=True)) +tables = extractor.get_tables() +print tables diff --git a/examples/demo_stream_columns.py b/examples/demo_stream_columns.py new file mode 100644 index 0000000..79cc6cb --- /dev/null +++ b/examples/demo_stream_columns.py @@ -0,0 +1,13 @@ +from camelot import Pdf +from camelot import Stream + + +extractor = Stream(Pdf("files/inconsistent_rows.pdf", char_margin=1.0), + columns="65,95,285,640,715,780", ytol=10) +tables = extractor.get_tables() +print tables + +extractor = Stream(Pdf("files/consistent_rows.pdf", char_margin=1.0), + columns="28,67,180,230,425,475,700", ytol=5) +tables = extractor.get_tables() +print tables diff --git a/examples/demo_stream_ncolumns.py b/examples/demo_stream_ncolumns.py new file mode 100644 index 0000000..b220bf1 --- /dev/null +++ b/examples/demo_stream_ncolumns.py @@ -0,0 +1,8 @@ +from camelot import Pdf +from camelot import Stream + + +extractor = Stream(Pdf("files/missing_values.pdf", + char_margin=1.0, clean=True), ncolumns=5) +tables = extractor.get_tables() +print tables diff --git a/examples/files/budget_2014-15.pdf b/examples/files/budget_2014-15.pdf new file mode 100644 index 0000000..9466e87 Binary files /dev/null and b/examples/files/budget_2014-15.pdf differ diff --git a/examples/files/column_span_1.pdf b/examples/files/column_span_1.pdf new file mode 100644 index 0000000..e7c164e Binary files /dev/null and 
b/examples/files/column_span_1.pdf differ diff --git a/examples/files/column_span_2.pdf b/examples/files/column_span_2.pdf new file mode 100644 index 0000000..5cab903 Binary files /dev/null and b/examples/files/column_span_2.pdf differ diff --git a/examples/files/consistent_rows.pdf b/examples/files/consistent_rows.pdf new file mode 100644 index 0000000..e0213aa Binary files /dev/null and b/examples/files/consistent_rows.pdf differ diff --git a/examples/files/inconsistent_rows.pdf b/examples/files/inconsistent_rows.pdf new file mode 100644 index 0000000..9eb4b63 Binary files /dev/null and b/examples/files/inconsistent_rows.pdf differ diff --git a/examples/files/left_rotated_table.pdf b/examples/files/left_rotated_table.pdf new file mode 100644 index 0000000..8b7a615 Binary files /dev/null and b/examples/files/left_rotated_table.pdf differ diff --git a/examples/files/lines_in_background_1.pdf b/examples/files/lines_in_background_1.pdf new file mode 100644 index 0000000..f23d6b7 Binary files /dev/null and b/examples/files/lines_in_background_1.pdf differ diff --git a/examples/files/lines_in_background_2.pdf b/examples/files/lines_in_background_2.pdf new file mode 100644 index 0000000..b64b2f2 Binary files /dev/null and b/examples/files/lines_in_background_2.pdf differ diff --git a/examples/files/missing_values.pdf b/examples/files/missing_values.pdf new file mode 100644 index 0000000..90b620f Binary files /dev/null and b/examples/files/missing_values.pdf differ diff --git a/examples/files/right_rotated_table.pdf b/examples/files/right_rotated_table.pdf new file mode 100644 index 0000000..9494465 Binary files /dev/null and b/examples/files/right_rotated_table.pdf differ diff --git a/examples/files/row_span_1.pdf b/examples/files/row_span_1.pdf new file mode 100644 index 0000000..ef2c7ce Binary files /dev/null and b/examples/files/row_span_1.pdf differ diff --git a/examples/files/row_span_2.pdf b/examples/files/row_span_2.pdf new file mode 100644 index 0000000..39bce84 
Binary files /dev/null and b/examples/files/row_span_2.pdf differ diff --git a/examples/files/twotables_1.pdf b/examples/files/twotables_1.pdf new file mode 100644 index 0000000..cbbeeda Binary files /dev/null and b/examples/files/twotables_1.pdf differ diff --git a/examples/files/twotables_2.pdf b/examples/files/twotables_2.pdf new file mode 100644 index 0000000..5249887 Binary files /dev/null and b/examples/files/twotables_2.pdf differ diff --git a/tests/mexican_towns.pdf b/tests/mexican_towns.pdf new file mode 100644 index 0000000..46cd236 Binary files /dev/null and b/tests/mexican_towns.pdf differ diff --git a/tests/test_lattice.py b/tests/test_lattice.py new file mode 100644 index 0000000..244c937 --- /dev/null +++ b/tests/test_lattice.py @@ -0,0 +1,97 @@ +# coding: utf8 +import os + +from nose.tools import assert_equal + +from camelot.pdf import Pdf +from camelot.lattice import Lattice + + +testdir = os.path.dirname(os.path.abspath(__file__)) + + +def test_lattice_basic(): + + data = [ + ["Cycle Name","KI (1/km)","Distance (mi)","Percent Fuel Savings","","",""], + ["","","","Improved Speed","Decreased Accel","Eliminate Stops","Decreased Idle"], + ["2012_2","3.30","1.3","5.9%","9.5%","29.2%","17.4%"], + ["2145_1","0.68","11.2","2.4%","0.1%","9.5%","2.7%"], + ["4234_1","0.59","58.7","8.5%","1.3%","8.5%","3.3%"], + ["2032_2","0.17","57.8","21.7%","0.3%","2.7%","1.2%"], + ["4171_1","0.07","173.9","58.1%","1.6%","2.1%","0.5%"] + ] + pdfname = os.path.join(testdir, + "tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-030.pdf") + extractor = Lattice(Pdf(pdfname, + pagenos=[{'start': 2, 'end': 2}], clean=True)) + tables = extractor.get_tables() + assert_equal(tables['pg-2'][0], data) + + +def test_lattice_fill(): + + data = [ + ["Plan Type","County","Plan Name","Totals"], + ["GMC","Sacramento","Anthem Blue Cross","164,380"], + ["GMC","Sacramento","Health Net","126,547"], + ["GMC","Sacramento","Kaiser Foundation","74,620"], + ["GMC","Sacramento","Molina 
Healthcare","59,989"], + ["GMC","San Diego","Care 1st Health Plan","71,831"], + ["GMC","San Diego","Community Health Group","264,639"], + ["GMC","San Diego","Health Net","72,404"], + ["GMC","San Diego","Kaiser","50,415"], + ["GMC","San Diego","Molina Healthcare","206,430"], + ["GMC","Total GMC Enrollment","","1,091,255"], + ["COHS","Marin","Partnership Health Plan of CA","36,006"], + ["COHS","Mendocino","Partnership Health Plan of CA","37,243"], + ["COHS","Napa","Partnership Health Plan of CA","28,398"], + ["COHS","Solano","Partnership Health Plan of CA","113,220"], + ["COHS","Sonoma","Partnership Health Plan of CA","112,271"], + ["COHS","Yolo","Partnership Health Plan of CA","52,674"], + ["COHS","Del Norte","Partnership Health Plan of CA","11,242"], + ["COHS","Humboldt","Partnership Health Plan of CA","49,911"], + ["COHS","Lake","Partnership Health Plan of CA","29,149"], + ["COHS","Lassen","Partnership Health Plan of CA","7,360"], + ["COHS","Modoc","Partnership Health Plan of CA","2,940"], + ["COHS","Shasta","Partnership Health Plan of CA","61,763"], + ["COHS","Siskiyou","Partnership Health Plan of CA","16,715"], + ["COHS","Trinity","Partnership Health Plan of CA","4,542"], + ["COHS","Merced","Central California Alliance for Health","123,907"], + ["COHS","Monterey","Central California Alliance for Health","147,397"], + ["COHS","Santa Cruz","Central California Alliance for Health","69,458"], + ["COHS","Santa Barbara","CenCal","117,609"], + ["COHS","San Luis Obispo","CenCal","55,761"], + ["COHS","Orange","CalOptima","783,079"], + ["COHS","San Mateo","Health Plan of San Mateo","113,202"], + ["COHS","Ventura","Gold Coast Health Plan","202,217"], + ["COHS","Total COHS Enrollment","","2,176,064"], + ["Subtotal for Two-Plan, Regional Model, GMC and COHS","","","10,132,022"], + ["PCCM","Los Angeles","AIDS Healthcare Foundation","828"], + ["PCCM","San Francisco","Family Mosaic","25"], + ["PCCM","Total PHP Enrollment","","853"], + ["All Models Total 
Enrollments","","","10,132,875"], + ["Source: Data Warehouse 12/14/15","","",""] + ] + pdfname = os.path.join(testdir, 'row_span_1.pdf') + extractor = Lattice(Pdf(pdfname, clean=True), fill='v', scale=40) + tables = extractor.get_tables() + assert_equal(tables['pg-1'][0], data) + + +def test_lattice_invert(): + + data = [ + ["State","Date","Halt stations","Halt days","Persons directly reached(in lakh)","Persons trained","Persons counseled","Persons testedfor HIV"], + ["Delhi","1.12.2009","8","17","1.29","3,665","2,409","1,000"], + ["Rajasthan","2.12.2009 to 19.12.2009","","","","","",""], + ["Gujarat","20.12.2009 to 3.1.2010","6","13","6.03","3,810","2,317","1,453"], + ["Maharashtra","4.01.2010 to 1.2.2010","13","26","1.27","5,680","9,027","4,153"], + ["Karnataka","2.2.2010 to 22.2.2010","11","19","1.80","5,741","3,658","3,183"], + ["Kerala","23.2.2010 to 11.3.2010","9","17","1.42","3,559","2,173","855"], + ["Total","","47","92","11.81","22,455","19,584","10,644"] + ] + pdfname = os.path.join(testdir, 'lines_in_background_1.pdf') + extractor = Lattice(Pdf(pdfname, clean=True), invert=True) + tables = extractor.get_tables() + assert_equal(tables['pg-1'][1], data) \ No newline at end of file diff --git a/tests/test_stream.py b/tests/test_stream.py new file mode 100644 index 0000000..34d45bf --- /dev/null +++ b/tests/test_stream.py @@ -0,0 +1,169 @@ +# coding: utf8 +import os + +from nose.tools import assert_equal + +from camelot.pdf import Pdf +from camelot.stream import Stream + + +testdir = os.path.dirname(os.path.abspath(__file__)) + + +def test_stream_basic(): + + data = [ + ["","","","",""], + ["C Appendix C: Summary Statistics","","","",""], + ["","Table C1: Summary Statistics","","",""], + ["","This table contains summary statistics for 2,012 respondents in SAVE 2009.","","",""], + ["Variable","Mean","Std. Dev. 
Min","","Max"], + ["Age","50.8","15.9","21","90"], + ["Men","0.47","0.50","0","1"], + ["East","0.28","0.45","0","1"], + ["Rural","0.15","0.36","0","1"], + ["Married","0.57","0.50","0","1"], + ["Single","0.21","0.40","0","1"], + ["Divorced","0.13","0.33","0","1"], + ["Widowed","0.08","0.26","0","1"], + ["Separated","0.03","0.16","0","1"], + ["Partner","0.65","0.48","0","1"], + ["Employed","0.55","0.50","0","1"], + ["Fulltime","0.34","0.47","0","1"], + ["Parttime","0.20","0.40","0","1"], + ["Unemployed","0.08","0.28","0","1"], + ["Homemaker","0.19","0.40","0","1"], + ["Retired","0.28","0.45","0","1"], + ["Household size","2.43","1.22","1","9"], + ["Households with children","0.37","0.48","0","1"], + ["Number of children","1.67","1.38","0","8"], + ["Lower secondary education","0.08","0.27","0","1"], + ["Upper secondary education","0.60","0.49","0","1"], + ["Post secondary, non tert. education","0.12","0.33","0","1"], + ["First stage tertiary education","0.17","0.38","0","1"], + ["Other education","0.03","0.17","0","1"], + ["Household income (Euro/month)","2,127","1,389","22","22,500"], + ["Gross wealth - end of 2007 (Euro)","187,281","384,198","0","7,720,000"], + ["Gross financial wealth - end of 2007 (Euro)","38,855","114,128","0","2,870,000"], + ["","Source: SAVE 2008 and 2009, data is weighted and imputed.","","",""], + ["","","","","ECB"], + ["","","","","Working Paper Series No 1299"], + ["","","","","Febuary 2011"] + ] + + pdfname = os.path.join(testdir, + "tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-027.pdf") + extractor = Stream(Pdf(pdfname, pagenos=[{'start': 3, 'end': 3}], + clean=True)) + tables = extractor.get_tables() + assert_equal(tables['pg-3'][0], data) + + +def test_stream_ncolumns(): + + data = [ + ["","","","",""], + ["","Bhandara - Key Indicators","","",""], + ["","DLHS-4 (2012-13)","","DLHS-3 (2007-08)",""], + ["Indicators","TOTAL","RURAL","TOTAL","RURAL"], + ["Reported Prevalence of Morbidity","","","",""], + ["Any Injury 
.....................................................................................................................................","1.9","2.1","",""], + ["Acute Illness .................................................................................................................................","4.5","5.6","",""], + ["Chronic Illness ..............................................................................................................................","5.1","4.1","",""], + ["Reported Prevalence of Chronic Illness during last one year (%)","","","",""], + ["Disease of respiratory system ......................................................................................................","11.7","15.0","",""], + ["Disease of cardiovascular system ................................................................................................","8.9","9.3","",""], + ["Persons suffering from tuberculosis .............................................................................................","2.2","1.5","",""], + ["Anaemia Status by Haemoglobin Level14 (%)","","","",""], + ["Children (6-59 months) having anaemia ......................................................................................","68.5","71.9","",""], + ["Children (6-59 months) having severe anaemia ..........................................................................","6.7","9.4","",""], + ["Children (6-9 Years) having anaemia - Male ................................................................................","67.1","71.4","",""], + ["Children (6-9 Years) having severe anaemia - Male ....................................................................","4.4","2.4","",""], + ["Children (6-9 Years) having anaemia - Female ...........................................................................","52.4","48.8","",""], + ["Children (6-9 Years) having severe anaemia - Female ................................................................","1.2","0.0","",""], + 
["Children (6-14 years) having anaemia - Male .............................................................................","50.8","62.5","",""], + ["Children (6-14 years) having severe anaemia - Male ..................................................................","3.7","3.6","",""], + ["Children (6-14 years) having anaemia - Female .........................................................................","48.3","50.0","",""], + ["Children (6-14 years) having severe anaemia - Female ..............................................................","4.3","6.1","",""], + ["Children (10-19 Years15) having anaemia - Male .........................................................................","37.9","51.2","",""], + ["Children (10-19 Years15) having severe anaemia - Male .............................................................","3.5","4.0","",""], + ["Children (10-19 Years15) having anaemia - Female .....................................................................","46.6","52.1","",""], + ["Children (10-19 Years15) having severe anaemia - Female .........................................................","6.4","6.5","",""], + ["Adolescents (15-19 years) having anaemia ................................................................................","39.4","46.5","",""], + ["Adolescents (15-19 years) having severe anaemia .....................................................................","5.4","5.1","",""], + ["Pregnant women (15-49 aged) having anaemia ..........................................................................","48.8","51.5","",""], + ["Pregnant women (15-49 aged) having severe anaemia ..............................................................","7.1","8.8","",""], + ["Women (15-49 aged) having anaemia .........................................................................................","45.2","51.7","",""], + ["Women (15-49 aged) having severe anaemia 
.............................................................................","4.8","5.9","",""], + ["Persons (20 years and above) having anaemia ...........................................................................","37.8","42.1","",""], + ["Persons (20 years and above) having Severe anaemia ..............................................................","4.6","4.8","",""], + ["Blood Sugar Level (age 18 years and above) (%)","","","",""], + ["Blood Sugar Level >140 mg/dl (high) ...........................................................................................","12.9","11.1","",""], + ["Blood Sugar Level >160 mg/dl (very high) ...................................................................................","7.0","5.1","",""], + ["Hypertension (age 18 years and above) (%)","","","",""], + ["Above Normal Range (Systolic >140 mm of Hg & Diastolic >90 mm of Hg ) ..............................","23.8","22.8","",""], + ["Moderately High (Systolic >160 mm of Hg & Diastolic >100 mm of Hg ) .....................................","8.2","7.1","",""], + ["Very High (Systolic >180 mm of Hg & Diastolic >110 mm of Hg ) ...............................................","3.7","3.1","",""], + ["14 Any anaemia below 11g/dl, severe anaemia below 7g/dl. 
15 Excluding age group 19 years","","","",""], + ["","Chronic Illness :Any person with symptoms persisting for longer than one month is defined as suffering from chronic illness","","",""] + ] + pdfname = os.path.join(testdir, 'missing_values.pdf') + extractor = Stream(Pdf(pdfname, char_margin=1.0, clean=True), + ncolumns=5) + tables = extractor.get_tables() + assert_equal(tables['pg-1'][0], data) + + +def test_stream_columns(): + + data = [ + ["","","","","",""], + ["Clave","","Clave","","Clave",""], + ["","Nombre Entidad","","Nombre Municipio","","Nombre Localidad"], + ["Entidad","","Municipio","","Localidad",""], + ["01","Aguascalientes","001","Aguascalientes","0094","Granja Adelita"], + ["01","Aguascalientes","001","Aguascalientes","0096","Agua Azul"], + ["01","Aguascalientes","001","Aguascalientes","0100","Rancho Alegre"], + ["01","Aguascalientes","001","Aguascalientes","0102","Los Arbolitos [Rancho]"], + ["01","Aguascalientes","001","Aguascalientes","0104","Ardillas de Abajo (Las Ardillas)"], + ["01","Aguascalientes","001","Aguascalientes","0106","Arellano"], + ["01","Aguascalientes","001","Aguascalientes","0112","Bajío los Vázquez"], + ["01","Aguascalientes","001","Aguascalientes","0113","Bajío de Montoro"], + ["01","Aguascalientes","001","Aguascalientes","0114","Residencial San Nicolás [Baños la Cantera]"], + ["01","Aguascalientes","001","Aguascalientes","0120","Buenavista de Peñuelas"], + ["01","Aguascalientes","001","Aguascalientes","0121","Cabecita 3 Marías (Rancho Nuevo)"], + ["01","Aguascalientes","001","Aguascalientes","0125","Cañada Grande de Cotorina"], + ["01","Aguascalientes","001","Aguascalientes","0126","Cañada Honda [Estación]"], + ["01","Aguascalientes","001","Aguascalientes","0127","Los Caños"], + ["01","Aguascalientes","001","Aguascalientes","0128","El Cariñán"], + ["01","Aguascalientes","001","Aguascalientes","0129","El Carmen [Granja]"], + ["01","Aguascalientes","001","Aguascalientes","0135","El Cedazo (Cedazo de San Antonio)"], + 
["01","Aguascalientes","001","Aguascalientes","0138","Centro de Arriba (El Taray)"], + ["01","Aguascalientes","001","Aguascalientes","0139","Cieneguilla (La Lumbrera)"], + ["01","Aguascalientes","001","Aguascalientes","0141","Cobos"], + ["01","Aguascalientes","001","Aguascalientes","0144","El Colorado (El Soyatal)"], + ["01","Aguascalientes","001","Aguascalientes","0146","El Conejal"], + ["01","Aguascalientes","001","Aguascalientes","0157","Cotorina de Abajo"], + ["01","Aguascalientes","001","Aguascalientes","0162","Coyotes"], + ["01","Aguascalientes","001","Aguascalientes","0166","La Huerta (La Cruz)"], + ["01","Aguascalientes","001","Aguascalientes","0170","Cuauhtémoc (Las Palomas)"], + ["01","Aguascalientes","001","Aguascalientes","0171","Los Cuervos (Los Ojos de Agua)"], + ["01","Aguascalientes","001","Aguascalientes","0172","San José [Granja]"], + ["01","Aguascalientes","001","Aguascalientes","0176","La Chiripa"], + ["01","Aguascalientes","001","Aguascalientes","0182","Dolores"], + ["01","Aguascalientes","001","Aguascalientes","0183","Los Dolores"], + ["01","Aguascalientes","001","Aguascalientes","0190","El Duraznillo"], + ["01","Aguascalientes","001","Aguascalientes","0191","Los Durón"], + ["01","Aguascalientes","001","Aguascalientes","0197","La Escondida"], + ["01","Aguascalientes","001","Aguascalientes","0201","Brande Vin [Bodegas]"], + ["01","Aguascalientes","001","Aguascalientes","0207","Valle Redondo"], + ["01","Aguascalientes","001","Aguascalientes","0209","La Fortuna"], + ["01","Aguascalientes","001","Aguascalientes","0212","Lomas del Gachupín"], + ["01","Aguascalientes","001","Aguascalientes","0213","El Carmen (Gallinas Güeras) [Rancho]"], + ["01","Aguascalientes","001","Aguascalientes","0216","La Gloria"] + ] + pdfname = os.path.join(testdir, 'mexican_towns.pdf') + extractor = Stream(Pdf(pdfname, clean=True), + columns='28,67,180,230,425,475,700') + tables = extractor.get_tables() + assert_equal(tables['pg-1'][0], data) \ No newline at end of file