Merge pull request #206 from socialcopsdev/stream-nurminen-detection

[MRG] Add implementation of Anssi Nurminen's table detection algorithm
pull/2/head
Vinayak Mehta 2018-11-23 21:35:03 +05:30 committed by GitHub
commit e7835cac33
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 534 additions and 144 deletions

View File

@ -15,7 +15,7 @@ install:
pip install ".[dev]"
test:
pytest --verbose --cov-config .coveragerc --cov-report term --cov-report xml --cov=camelot --mpl tests
pytest --verbose --cov-config .coveragerc --cov-report term --cov-report xml --cov=camelot --mpl
docs:
cd docs && make html

View File

@ -69,7 +69,7 @@ $ conda install -c conda-forge camelot-py
### Using pip
After [installing the dependencies](https://camelot-py.readthedocs.io/en/master/user/install.html#using-pip) ([tk](https://packages.ubuntu.com/trusty/python-tk) and [ghostscript](https://www.ghostscript.com/)), you can simply use pip to install Camelot:
After [installing the dependencies](https://camelot-py.readthedocs.io/en/master/user/install-deps.html) ([tk](https://packages.ubuntu.com/trusty/python-tk) and [ghostscript](https://www.ghostscript.com/)), you can simply use pip to install Camelot:
<pre>
$ pip install camelot-py[cv]

View File

@ -1,11 +1,23 @@
# -*- coding: utf-8 -*-
VERSION = (0, 3, 2)
VERSION = (0, 4, 0)
PRERELEASE = None # alpha, beta or rc
REVISION = None
def generate_version(version, prerelease=None, revision=None):
version_parts = ['.'.join(map(str, version))]
if prerelease is not None:
version_parts.append('-{}'.format(prerelease))
if revision is not None:
version_parts.append('.{}'.format(revision))
return ''.join(version_parts)
__title__ = 'camelot-py'
__description__ = 'PDF Table Extraction for Humans.'
__url__ = 'http://camelot-py.readthedocs.io/'
__version__ = '.'.join(map(str, VERSION))
__version__ = generate_version(VERSION, prerelease=PRERELEASE, revision=REVISION)
__author__ = 'Vinayak Mehta'
__author_email__ = 'vmehta94@gmail.com'
__license__ = 'MIT License'

View File

@ -3,11 +3,210 @@
import os
import zipfile
import tempfile
from itertools import chain
from operator import itemgetter
import numpy as np
import pandas as pd
# minimum number of vertical textline intersections for a textedge
# to be considered valid
TEXTEDGE_REQUIRED_ELEMENTS = 4
# y coordinate tolerance for extending textedge
TEXTEDGE_EXTEND_TOLERANCE = 50
# padding added to table area on the left, right and bottom
TABLE_AREA_PADDING = 10
class TextEdge(object):
"""Defines a text edge coordinates relative to a left-bottom
origin. (PDF coordinate space)
Parameters
----------
x : float
x-coordinate of the text edge.
y0 : float
y-coordinate of bottommost point.
y1 : float
y-coordinate of topmost point.
align : string, optional (default: 'left')
{'left', 'right', 'middle'}
Attributes
----------
intersections: int
Number of intersections with horizontal text rows.
is_valid: bool
A text edge is valid if it intersections with at least
TEXTEDGE_REQUIRED_ELEMENTS horizontal text rows.
"""
def __init__(self, x, y0, y1, align='left'):
self.x = x
self.y0 = y0
self.y1 = y1
self.align = align
self.intersections = 0
self.is_valid = False
def __repr__(self):
return '<TextEdge x={} y0={} y1={} align={} valid={}>'.format(
round(self.x, 2), round(self.y0, 2), round(self.y1, 2), self.align, self.is_valid)
def update_coords(self, x, y0):
"""Updates the text edge's x and bottom y coordinates and sets
the is_valid attribute.
"""
if np.isclose(self.y0, y0, atol=TEXTEDGE_EXTEND_TOLERANCE):
self.x = (self.intersections * self.x + x) / float(self.intersections + 1)
self.y0 = y0
self.intersections += 1
# a textedge is valid only if it extends uninterrupted
# over a required number of textlines
if self.intersections > TEXTEDGE_REQUIRED_ELEMENTS:
self.is_valid = True
class TextEdges(object):
"""Defines a dict of left, right and middle text edges found on
the PDF page. The dict has three keys based on the alignments,
and each key's value is a list of camelot.core.TextEdge objects.
"""
def __init__(self):
self._textedges = {'left': [], 'right': [], 'middle': []}
@staticmethod
def get_x_coord(textline, align):
"""Returns the x coordinate of a text row based on the
specified alignment.
"""
x_left = textline.x0
x_right = textline.x1
x_middle = x_left + (x_right - x_left) / 2.0
x_coord = {'left': x_left, 'middle': x_middle, 'right': x_right}
return x_coord[align]
def find(self, x_coord, align):
"""Returns the index of an existing text edge using
the specified x coordinate and alignment.
"""
for i, te in enumerate(self._textedges[align]):
if np.isclose(te.x, x_coord, atol=0.5):
return i
return None
def add(self, textline, align):
"""Adds a new text edge to the current dict.
"""
x = self.get_x_coord(textline, align)
y0 = textline.y0
y1 = textline.y1
te = TextEdge(x, y0, y1, align=align)
self._textedges[align].append(te)
def update(self, textline):
"""Updates an existing text edge in the current dict.
"""
for align in ['left', 'right', 'middle']:
x_coord = self.get_x_coord(textline, align)
idx = self.find(x_coord, align)
if idx is None:
self.add(textline, align)
else:
self._textedges[align][idx].update_coords(x_coord, textline.y0)
def generate(self, textlines):
"""Generates the text edges dict based on horizontal text
rows.
"""
for tl in textlines:
if len(tl.get_text().strip()) > 1: # TODO: hacky
self.update(tl)
def get_relevant(self):
"""Returns the list of relevant text edges (all share the same
alignment) based on which list intersects horizontal text rows
the most.
"""
intersections_sum = {
'left': sum(te.intersections for te in self._textedges['left'] if te.is_valid),
'right': sum(te.intersections for te in self._textedges['right'] if te.is_valid),
'middle': sum(te.intersections for te in self._textedges['middle'] if te.is_valid)
}
# TODO: naive
# get vertical textedges that intersect maximum number of
# times with horizontal textlines
relevant_align = max(intersections_sum.items(), key=itemgetter(1))[0]
return self._textedges[relevant_align]
def get_table_areas(self, textlines, relevant_textedges):
"""Returns a dict of interesting table areas on the PDF page
calculated using relevant text edges.
"""
def pad(area, average_row_height):
x0 = area[0] - TABLE_AREA_PADDING
y0 = area[1] - TABLE_AREA_PADDING
x1 = area[2] + TABLE_AREA_PADDING
# add a constant since table headers can be relatively up
y1 = area[3] + average_row_height * 5
return (x0, y0, x1, y1)
# sort relevant textedges in reading order
relevant_textedges.sort(key=lambda te: (-te.y0, te.x))
table_areas = {}
for te in relevant_textedges:
if te.is_valid:
if not table_areas:
table_areas[(te.x, te.y0, te.x, te.y1)] = None
else:
found = None
for area in table_areas:
# check for overlap
if te.y1 >= area[1] and te.y0 <= area[3]:
found = area
break
if found is None:
table_areas[(te.x, te.y0, te.x, te.y1)] = None
else:
table_areas.pop(found)
updated_area = (
found[0], min(te.y0, found[1]), max(found[2], te.x), max(found[3], te.y1))
table_areas[updated_area] = None
# extend table areas based on textlines that overlap
# vertically. it's possible that these textlines were
# eliminated during textedges generation since numbers and
# chars/words/sentences are often aligned differently.
# drawback: table areas that have paragraphs on their sides
# will include the paragraphs too.
sum_textline_height = 0
for tl in textlines:
sum_textline_height += tl.y1 - tl.y0
found = None
for area in table_areas:
# check for overlap
if tl.y0 >= area[1] and tl.y1 <= area[3]:
found = area
break
if found is not None:
table_areas.pop(found)
updated_area = (
min(tl.x0, found[0]), min(tl.y0, found[1]), max(found[2], tl.x1), max(found[3], tl.y1))
table_areas[updated_area] = None
average_textline_height = sum_textline_height / float(len(textlines))
# add some padding to table areas
table_areas_padded = {}
for area in table_areas:
table_areas_padded[pad(area, average_textline_height)] = None
return table_areas_padded
class Cell(object):
"""Defines a cell in a table with coordinates relative to a
left-bottom origin. (PDF coordinate space)

View File

@ -9,7 +9,7 @@ import numpy as np
import pandas as pd
from .base import BaseParser
from ..core import Table
from ..core import TextEdges, Table
from ..utils import (text_in_bbox, get_table_index, compute_accuracy,
compute_whitespace)
@ -116,7 +116,7 @@ class Stream(BaseParser):
row_y = t.y0
temp.append(t)
rows.append(sorted(temp, key=lambda t: t.x0))
__ = rows.pop(0) # hacky
__ = rows.pop(0) # TODO: hacky
return rows
@staticmethod
@ -246,6 +246,31 @@ class Stream(BaseParser):
raise ValueError("Length of table_areas and columns"
" should be equal")
def _nurminen_table_detection(self, textlines):
"""A general implementation of the table detection algorithm
described by Anssi Nurminen's master's thesis.
Link: https://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3
Assumes that tables are situated relatively far apart
vertically.
"""
# TODO: add support for arabic text #141
# sort textlines in reading order
textlines.sort(key=lambda x: (-x.y0, x.x0))
textedges = TextEdges()
# generate left, middle and right textedges
textedges.generate(textlines)
# select relevant edges
relevant_textedges = textedges.get_relevant()
# guess table areas using textlines and relevant edges
table_bbox = textedges.get_table_areas(textlines, relevant_textedges)
# treat whole page as table area if no table areas found
if not len(table_bbox):
table_bbox = {(0, 0, self.pdf_width, self.pdf_height): None}
return table_bbox
def _generate_table_bbox(self):
if self.table_areas is not None:
table_bbox = {}
@ -257,7 +282,8 @@ class Stream(BaseParser):
y2 = float(y2)
table_bbox[(x1, y2, x2, y1)] = None
else:
table_bbox = {(0, 0, self.pdf_width, self.pdf_height): None}
# find tables based on nurminen's detection algorithm
table_bbox = self._nurminen_table_detection(self.horizontal_text)
self.table_bbox = table_bbox
def _generate_columns_and_rows(self, table_idx, tk):
@ -286,10 +312,21 @@ class Stream(BaseParser):
cols.append(text_x_max)
cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
else:
# calculate mode of the list of number of elements in
# each row to guess the number of columns
ncols = max(set(elements), key=elements.count)
if ncols == 1:
warnings.warn("No tables found on {}".format(
os.path.basename(self.rootname)))
# if mode is 1, the page usually contains not tables
# but there can be cases where the list can be skewed,
# try to remove all 1s from list in this case and
# see if the list contains elements, if yes, then use
# the mode after removing 1s
elements = list(filter(lambda x: x != 1, elements))
if len(elements):
ncols = max(set(elements), key=elements.count)
else:
warnings.warn("No tables found in table area {}".format(
table_idx + 1))
cols = [(t.x0, t.x1) for r in rows_grouped if len(r) == ncols for t in r]
cols = self._merge_columns(sorted(cols), col_close_tol=self.col_close_tol)
inner_text = []

View File

@ -7,7 +7,7 @@ If you're reading this, you're probably looking to contributing to Camelot. *Tim
This document will help you get started with contributing documentation, code, testing and filing issues. If you have any questions, feel free to reach out to `Vinayak Mehta`_, the author and maintainer.
.. _Vinayak Mehta: https://vinayak-mehta.github.io
.. _Vinayak Mehta: https://www.vinayakmehta.com
Code Of Conduct
---------------

View File

@ -92,6 +92,7 @@ This part of the documentation begins with some background information about why
:maxdepth: 2
user/intro
user/install-deps
user/install
user/how-it-works
user/quickstart

View File

@ -5,24 +5,24 @@ How It Works
This part of the documentation includes a high-level explanation of how Camelot extracts tables from PDF files.
You can choose between two table parsing methods, *Stream* and *Lattice*. These names for parsing methods inside Camelot were inspired from `Tabula`_.
.. _Tabula: https://github.com/tabulapdf/tabula
You can choose between two table parsing methods, *Stream* and *Lattice*. These names for parsing methods inside Camelot were inspired from `Tabula <https://github.com/tabulapdf/tabula>`_.
.. _stream:
Stream
------
Stream can be used to parse tables that have whitespaces between cells to simulate a table structure. It looks for these spaces between text to form a table representation.
Stream can be used to parse tables that have whitespaces between cells to simulate a table structure. It is built on top of PDFMiner's functionality of grouping characters on a page into words and sentences, using `margins <https://euske.github.io/pdfminer/#tools>`_.
It is built on top of PDFMiner's functionality of grouping characters on a page into words and sentences, using `margins`_. After getting the words on a page, it groups them into rows based on their *y* coordinates. It then tries to guess the number of columns the table might have by calculating the mode of the number of words in each row. This mode is used to calculate *x* ranges for the table's columns. It then adds columns to this column range list based on any words that may lie outside or inside the current column *x* ranges.
1. Words on the PDF page are grouped into text rows based on their *y* axis overlaps.
.. _margins: https://euske.github.io/pdfminer/#tools
2. Textedges are calculated and then used to guess interesting table areas on the PDF page. You can read `Anssi Nurminen's master's thesis <http://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3>`_ to know more about this table detection technique. [See pages 20, 35 and 40]
.. note:: By default, Stream treats the whole PDF page as a table, which isn't ideal when there are more than two tables on a page with different number of columns. Automatic table detection for Stream is `in the works`_.
3. The number of columns inside each table area are then guessed. This is done by calculating the mode of number of words in each text row. Based on this mode, words in each text row are chosen to calculate a list of column *x* ranges.
.. _in the works: https://github.com/socialcopsdev/camelot/issues/102
4. Words that lie inside/outside the current column *x* ranges are then used to extend extend the current list of columns.
5. Finally, a table is formed using the text rows' *y* ranges and column *x* ranges and words found on the page are assigned to the table's cells based on their *x* and *y* coordinates.
.. _lattice:

View File

@ -0,0 +1,76 @@
.. _install_deps:
Installation of dependencies
============================
The dependencies `Tkinter`_ and `ghostscript`_ can be installed using your system's package manager. You can run one of the following, based on your OS.
.. _Tkinter: https://wiki.python.org/moin/TkInter
.. _ghostscript: https://www.ghostscript.com
OS-specific instructions
------------------------
For Ubuntu
^^^^^^^^^^
::
$ apt install python-tk ghostscript
Or for Python 3::
$ apt install python3-tk ghostscript
For macOS
^^^^^^^^^
::
$ brew install tcl-tk ghostscript
For Windows
^^^^^^^^^^^
For Tkinter, you can download the `ActiveTcl Community Edition`_ from ActiveState. For ghostscript, you can get the installer at the `ghostscript downloads page`_.
After installing ghostscript, you'll need to reboot your system to make sure that the ghostscript executable's path is in the windows PATH environment variable. In case you don't want to reboot, you can manually add the ghostscript executable's path to the PATH variable, `as shown here`_.
.. _ActiveTcl Community Edition: https://www.activestate.com/activetcl/downloads
.. _ghostscript downloads page: https://www.ghostscript.com/download/gsdnld.html
.. _as shown here: https://java.com/en/download/help/path.xml
Checks to see if dependencies were installed correctly
------------------------------------------------------
You can do the following checks to see if the dependencies were installed correctly.
For Tkinter
^^^^^^^^^^^
Launch Python, and then at the prompt, type::
>>> import Tkinter
Or in Python 3::
>>> import tkinter
If you have Tkinter, Python will not print an error message, and if not, you will see an ``ImportError``.
For ghostscript
^^^^^^^^^^^^^^^
Run the following to check the ghostscript version.
For Ubuntu/macOS::
$ gs -version
For Windows::
C:\> gswin64c.exe -version
Or for Windows 32-bit::
C:\> gswin32c.exe -version
If you have ghostscript, you should see the ghostscript version and copyright information.

View File

@ -3,7 +3,7 @@
Installation of Camelot
=======================
This part of the documentation covers how to install Camelot.
This part of the documentation covers the steps to install Camelot.
Using conda
-----------
@ -23,84 +23,17 @@ The easiest way to install Camelot is to install it with `conda`_, which is a pa
Using pip
---------
First, you'll need to install the dependencies, which include `Tkinter`_ and `ghostscript`_.
After :ref:`installing the dependencies <install_deps>`, which include `Tkinter`_ and `ghostscript`_, you can simply use pip to install Camelot::
$ pip install camelot-py[cv]
.. _Tkinter: https://wiki.python.org/moin/TkInter
.. _ghostscript: https://www.ghostscript.com
These can be installed using your system's package manager. You can run one of the following, based on your OS.
For Ubuntu
^^^^^^^^^^
::
$ apt install python-tk ghostscript
Or for Python 3::
$ apt install python3-tk ghostscript
For macOS
^^^^^^^^^
::
$ brew install tcl-tk ghostscript
For Windows
^^^^^^^^^^^
For Tkinter, you can download the `ActiveTcl Community Edition`_ from ActiveState. For ghostscript, you can get the installer at the `ghostscript downloads page`_.
After installing ghostscript, you'll need to reboot your system to make sure that the ghostscript executable's path is in the windows PATH environment variable. In case you don't want to reboot, you can manually add the ghostscript executable's path to the PATH variable, `as shown here`_.
.. _ActiveTcl Community Edition: https://www.activestate.com/activetcl/downloads
.. _ghostscript downloads page: https://www.ghostscript.com/download/gsdnld.html
.. _as shown here: https://java.com/en/download/help/path.xml
----
You can do the following checks to see if the dependencies were installed correctly.
For Tkinter
^^^^^^^^^^^
Launch Python, and then at the prompt, type::
>>> import Tkinter
Or in Python 3::
>>> import tkinter
If you have Tkinter, Python will not print an error message, and if not, you will see an ``ImportError``.
For ghostscript
^^^^^^^^^^^^^^^
Run the following to check the ghostscript version.
For Ubuntu/macOS::
$ gs -version
For Windows::
C:\> gswin64c.exe -version
Or for Windows 32-bit::
C:\> gswin32c.exe -version
If you have ghostscript, you should see the ghostscript version and copyright information.
Finally, you can use pip to install Camelot::
$ pip install camelot-py[cv]
From the source code
--------------------
After `installing the dependencies`_, you can install from the source by:
After :ref:`installing the dependencies <install_deps>`, you can install from the source by:
1. Cloning the GitHub repository.
::
@ -112,5 +45,3 @@ After `installing the dependencies`_, you can install from the source by:
$ cd camelot
$ pip install ".[cv]"
.. _installing the dependencies: https://camelot-py.readthedocs.io/en/master/user/install.html#using-pip

View File

@ -2,5 +2,5 @@
test=pytest
[tool:pytest]
addopts = --verbose --cov-config .coveragerc --cov-report term --cov-report xml --cov=camelot --mpl tests
addopts = --verbose --cov-config .coveragerc --cov-report term --cov-report xml --cov=camelot --mpl
python_files = tests/test_*.py

View File

@ -33,52 +33,138 @@ data_stream = [
["Nagaland", "2,368,724", "204,329", "226,400", "0", "2,799,453", "783,054", "3,582,507"],
["Odisha", "14,317,179", "2,552,292", "1,107,250", "0", "17,976,721", "451,438", "18,428,159"],
["Puducherry", "4,191,757", "52,249", "192,400", "0", "4,436,406", "2,173", "4,438,579"],
["Punjab", "19,775,485", "2,208,343", "2,470,882", "0", "24,454,710", "1,436,522", "25,891,232"],
["", "Health Sector Financing by Centre and States/UTs in India [2009-10 to 2012-13](Revised) P a g e |23", "", "", "", "", "", ""]
["Punjab", "19,775,485", "2,208,343", "2,470,882", "0", "24,454,710", "1,436,522", "25,891,232"]
]
data_stream_table_rotated = [
["", "", "Table 21 Current use of contraception by background characteristics\u2014Continued", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""],
["", "", "", "", "", "", "Modern method", "", "", "", "", "", "", "Traditional method", "", "", "", ""],
["", "", "", "Any", "", "", "", "", "", "", "Other", "Any", "", "", "", "Not", "", "Number"],
["", "", "Any", "modern", "Female", "Male", "", "", "", "Condom/", "modern", "traditional", "", "With-", "Folk", "currently", "", "of"],
["", "Background characteristic", "method", "method", "sterilization", "sterilization", "Pill", "IUD", "Injectables", "Nirodh", "method", "method", "Rhythm", "drawal", "method", "using", "Total", "women"],
["", "Caste/tribe", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""],
["", "Scheduled caste", "74.8", "55.8", "42.9", "0.9", "9.7", "0.0", "0.2", "2.2", "0.0", "19.0", "11.2", "7.4", "0.4", "25.2", "100.0", "1,363"],
["", "Scheduled tribe", "59.3", "39.0", "26.8", "0.6", "6.4", "0.6", "1.2", "3.5", "0.0", "20.3", "10.4", "5.8", "4.1", "40.7", "100.0", "256"],
["", "Other backward class", "71.4", "51.1", "34.9", "0.0", "8.6", "1.4", "0.0", "6.2", "0.0", "20.4", "12.6", "7.8", "0.0", "28.6", "100.0", "211"],
["", "Other", "71.1", "48.8", "28.2", "0.8", "13.3", "0.9", "0.3", "5.2", "0.1", "22.3", "12.9", "9.1", "0.3", "28.9", "100.0", "3,319"],
["", "Wealth index", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""],
["", "Lowest", "64.5", "48.6", "34.3", "0.5", "10.5", "0.6", "0.7", "2.0", "0.0", "15.9", "9.9", "4.6", "1.4", "35.5", "100.0", "1,258"],
["", "Second", "68.5", "50.4", "36.2", "1.1", "11.4", "0.5", "0.1", "1.1", "0.0", "18.1", "11.2", "6.7", "0.2", "31.5", "100.0", "1,317"],
["", "Middle", "75.5", "52.8", "33.6", "0.6", "14.2", "0.4", "0.5", "3.4", "0.1", "22.7", "13.4", "8.9", "0.4", "24.5", "100.0", "1,018"],
["", "Fourth", "73.9", "52.3", "32.0", "0.5", "12.5", "0.6", "0.2", "6.3", "0.2", "21.6", "11.5", "9.9", "0.2", "26.1", "100.0", "908"],
["", "Highest", "78.3", "44.4", "19.5", "1.0", "9.7", "1.4", "0.0", "12.7", "0.0", "33.8", "18.2", "15.6", "0.0", "21.7", "100.0", "733"],
["", "Number of living children", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""],
["", "No children", "25.1", "7.6", "0.3", "0.5", "2.0", "0.0",
["Table 21 Current use of contraception by background characteristics\u2014Continued", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""],
["", "", "", "", "", "Modern method", "", "", "", "", "", "", "Traditional method", "", "", "", ""],
["", "", "Any", "", "", "", "", "", "", "Other", "Any", "", "", "", "Not", "", "Number"],
["", "Any", "modern", "Female", "Male", "", "", "", "Condom/", "modern", "traditional", "", "With-", "Folk", "currently", "", "of"],
["Background characteristic", "method", "method", "sterilization", "sterilization", "Pill", "IUD", "Injectables", "Nirodh", "method", "method", "Rhythm", "drawal", "method", "using", "Total", "women"],
["Caste/tribe", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""],
["Scheduled caste", "74.8", "55.8", "42.9", "0.9", "9.7", "0.0", "0.2", "2.2", "0.0", "19.0", "11.2", "7.4", "0.4", "25.2", "100.0", "1,363"],
["Scheduled tribe", "59.3", "39.0", "26.8", "0.6", "6.4", "0.6", "1.2", "3.5", "0.0", "20.3", "10.4", "5.8", "4.1", "40.7", "100.0", "256"],
["Other backward class", "71.4", "51.1", "34.9", "0.0", "8.6", "1.4", "0.0", "6.2", "0.0", "20.4", "12.6", "7.8", "0.0", "28.6", "100.0", "211"],
["Other", "71.1", "48.8", "28.2", "0.8", "13.3", "0.9", "0.3", "5.2", "0.1", "22.3", "12.9", "9.1", "0.3", "28.9", "100.0", "3,319"],
["Wealth index", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""],
["Lowest", "64.5", "48.6", "34.3", "0.5", "10.5", "0.6", "0.7", "2.0", "0.0", "15.9", "9.9", "4.6", "1.4", "35.5", "100.0", "1,258"],
["Second", "68.5", "50.4", "36.2", "1.1", "11.4", "0.5", "0.1", "1.1", "0.0", "18.1", "11.2", "6.7", "0.2", "31.5", "100.0", "1,317"],
["Middle", "75.5", "52.8", "33.6", "0.6", "14.2", "0.4", "0.5", "3.4", "0.1", "22.7", "13.4", "8.9", "0.4", "24.5", "100.0", "1,018"],
["Fourth", "73.9", "52.3", "32.0", "0.5", "12.5", "0.6", "0.2", "6.3", "0.2", "21.6", "11.5", "9.9", "0.2", "26.1", "100.0", "908"],
["Highest", "78.3", "44.4", "19.5", "1.0", "9.7", "1.4", "0.0", "12.7", "0.0", "33.8", "18.2", "15.6", "0.0", "21.7", "100.0", "733"],
["Number of living children", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""],
["No children", "25.1", "7.6", "0.3", "0.5", "2.0", "0.0",
"0.0", "4.8", "0.0", "17.5", "9.0", "8.5", "0.0", "74.9", "100.0", "563"],
["", "1 child", "66.5", "32.1", "3.7", "0.7", "20.1", "0.7", "0.1", "6.9", "0.0", "34.3", "18.9", "15.2", "0.3", "33.5", "100.0", "1,190"],
["\x18\x18", "1 son", "66.8", "33.2", "4.1", "0.7", "21.1", "0.5", "0.3", "6.6", "0.0", "33.5", "21.2", "12.3", "0.0", "33.2", "100.0", "672"],
["", "No sons", "66.1", "30.7", "3.1", "0.6", "18.8", "0.8", "0.0", "7.3", "0.0", "35.4", "15.8", "19.0", "0.6", "33.9", "100.0", "517"],
["", "2 children", "81.6", "60.5", "41.8", "0.9", "11.6", "0.8", "0.3", "4.8", "0.2", "21.1", "12.2", "8.3", "0.6", "18.4", "100.0", "1,576"],
["", "1 or more sons", "83.7", "64.2", "46.4", "0.9", "10.8", "0.8", "0.4", "4.8", "0.1", "19.5", "11.1", "7.6", "0.7", "16.3", "100.0", "1,268"],
["", "No sons", "73.2", "45.5", "23.2", "1.0", "15.1", "0.9", "0.0", "4.8", "0.5", "27.7", "16.8", "11.0", "0.0", "26.8", "100.0", "308"],
["", "3 children", "83.9", "71.2", "57.7", "0.8", "9.8", "0.6", "0.5", "1.8", "0.0", "12.7", "8.7", "3.3", "0.8", "16.1", "100.0", "961"],
["", "1 or more sons", "85.0", "73.2", "60.3", "0.9", "9.4", "0.5", "0.5", "1.6", "0.0", "11.8", "8.1", "3.0", "0.7", "15.0", "100.0", "860"],
["", "No sons", "74.7", "53.8", "35.3", "0.0", "13.7", "1.6", "0.0", "3.2", "0.0", "20.9", "13.4", "6.1", "1.5", "25.3", "100.0", "101"],
["", "4+ children", "74.3", "58.1", "45.1", "0.6", "8.7", "0.6", "0.7", "2.4", "0.0", "16.1", "9.9", "5.4", "0.8", "25.7", "100.0", "944"],
["", "1 or more sons", "73.9", "58.2", "46.0", "0.7", "8.3", "0.7", "0.7", "1.9", "0.0", "15.7", "9.4", "5.5", "0.8", "26.1", "100.0", "901"],
["", "No sons", "(82.1)", "(57.3)", "(25.6)", "(0.0)", "(17.8)", "(0.0)", "(0.0)", "(13.9)", "(0.0)", "(24.8)", "(21.3)", "(3.5)", "(0.0)", "(17.9)", "100.0", "43"],
["", "Total", "71.2", "49.9", "32.2",
["1 child", "66.5", "32.1", "3.7", "0.7", "20.1", "0.7", "0.1", "6.9", "0.0", "34.3", "18.9", "15.2", "0.3", "33.5", "100.0", "1,190"],
["1 son", "66.8", "33.2", "4.1", "0.7", "21.1", "0.5", "0.3", "6.6", "0.0", "33.5", "21.2", "12.3", "0.0", "33.2", "100.0", "672"],
["No sons", "66.1", "30.7", "3.1", "0.6", "18.8", "0.8", "0.0", "7.3", "0.0", "35.4", "15.8", "19.0", "0.6", "33.9", "100.0", "517"],
["2 children", "81.6", "60.5", "41.8", "0.9", "11.6", "0.8", "0.3", "4.8", "0.2", "21.1", "12.2", "8.3", "0.6", "18.4", "100.0", "1,576"],
["1 or more sons", "83.7", "64.2", "46.4", "0.9", "10.8", "0.8", "0.4", "4.8", "0.1", "19.5", "11.1", "7.6", "0.7", "16.3", "100.0", "1,268"],
["No sons", "73.2", "45.5", "23.2", "1.0", "15.1", "0.9", "0.0", "4.8", "0.5", "27.7", "16.8", "11.0", "0.0", "26.8", "100.0", "308"],
["3 children", "83.9", "71.2", "57.7", "0.8", "9.8", "0.6", "0.5", "1.8", "0.0", "12.7", "8.7", "3.3", "0.8", "16.1", "100.0", "961"],
["1 or more sons", "85.0", "73.2", "60.3", "0.9", "9.4", "0.5", "0.5", "1.6", "0.0", "11.8", "8.1", "3.0", "0.7", "15.0", "100.0", "860"],
["No sons", "74.7", "53.8", "35.3", "0.0", "13.7", "1.6", "0.0", "3.2", "0.0", "20.9", "13.4", "6.1", "1.5", "25.3", "100.0", "101"],
["4+ children", "74.3", "58.1", "45.1", "0.6", "8.7", "0.6", "0.7", "2.4", "0.0", "16.1", "9.9", "5.4", "0.8", "25.7", "100.0", "944"],
["1 or more sons", "73.9", "58.2", "46.0", "0.7", "8.3", "0.7", "0.7", "1.9", "0.0", "15.7", "9.4", "5.5", "0.8", "26.1", "100.0", "901"],
["No sons", "(82.1)", "(57.3)", "(25.6)", "(0.0)", "(17.8)", "(0.0)", "(0.0)", "(13.9)", "(0.0)", "(24.8)", "(21.3)", "(3.5)", "(0.0)", "(17.9)", "100.0", "43"],
["Total", "71.2", "49.9", "32.2",
"0.7", "11.7", "0.6", "0.3", "4.3", "0.1", "21.3", "12.3", "8.4", "0.5", "28.8", "100.0", "5,234"],
["", "NFHS-2 (1998-99)", "66.6", "47.3", "32.0", "1.8", "9.2", "1.4", "na", "2.9", "na", "na", "8.7", "9.8", "na", "33.4", "100.0", "4,116"],
["", "NFHS-1 (1992-93)", "57.7", "37.6", "26.5", "4.3", "3.6", "1.3", "0.1", "1.9", "na", "na", "11.3", "8.3", "na", "42.3", "100.0", "3,970"],
["", "", "Note: If more than one method is used, only the most effective method is considered in this tabulation. Total includes women for whom caste/tribe was not known or is missing, who are", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""],
["", "not shown separately.", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""],
["", "na = Not available", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""],
["", "", "ns = Not shown; see table 2b, footnote 1", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""],
["", "( ) Based on 25-49 unweighted cases.", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""],
["", "", "", "", "", "", "", "", "54", "", "", "", "", "", "", "", "", ""]
["NFHS-2 (1998-99)", "66.6", "47.3", "32.0", "1.8", "9.2", "1.4", "na", "2.9", "na", "na", "8.7", "9.8", "na", "33.4", "100.0", "4,116"],
["NFHS-1 (1992-93)", "57.7", "37.6", "26.5", "4.3", "3.6", "1.3", "0.1", "1.9", "na", "na", "11.3", "8.3", "na", "42.3", "100.0", "3,970"]
]
data_stream_two_tables_1 = [
["[In thousands (11,062.6 represents 11,062,600) For year ending December 31. Based on Uniform Crime Reporting (UCR)", "", "", "", "", "", "", "", "", ""],
["Program. Represents arrests reported (not charged) by 12,910 agencies with a total population of 247,526,916 as estimated", "", "", "", "", "", "", "", "", ""],
["by the FBI. Some persons may be arrested more than once during a year, therefore, the data in this table, in some cases,", "", "", "", "", "", "", "", "", ""],
["could represent multiple arrests of the same person. See text, this section and source]", "", "", "", "", "", "", "", "", ""],
["", "", "Total", "", "", "Male", "", "", "Female", ""],
["Offense charged", "", "Under 18", "18 years", "", "Under 18", "18 years", "", "Under 18", "18 years"],
["", "Total", "years", "and over", "Total", "years", "and over", "Total", "years", "and over"],
["Total . . . . . . . . . . . . . . . . . . . . . . . . .", "11,062 .6", "1,540 .0", "9,522 .6", "8,263 .3", "1,071 .6", "7,191 .7", "2,799 .2", "468 .3", "2,330 .9"],
["Violent crime . . . . . . . . . . . . . . . . . .", "467 .9", "69 .1", "398 .8", "380 .2", "56 .5", "323 .7", "87 .7", "12 .6", "75 .2"],
["Murder and nonnegligent", "", "", "", "", "", "", "", "", ""],
["manslaughter . . . . . . . .. .. .. .. ..", "10.0", "0.9", "9.1", "9.0", "0.9", "8.1", "1.1", "", "1.0"],
["Forcible rape . . . . . . . .. .. .. .. .. .", "17.5", "2.6", "14.9", "17.2", "2.5", "14.7", "", "", ""],
["Robbery . . . .. .. . .. . ... . ... . ...", "102.1", "25.5", "76.6", "90.0", "22.9", "67.1", "12.1", "2.5", "9.5"],
["Aggravated assault . . . . . . . .. .. ..", "338.4", "40.1", "298.3", "264.0", "30.2", "233.8", "74.4", "9.9", "64.5"],
["Property crime . . . . . . . . . . . . . . . . .", "1,396 .4", "338 .7", "1,057 .7", "875 .9", "210 .8", "665 .1", "608 .2", "127 .9", "392 .6"],
["Burglary . .. . . . . .. ... .... .... ..", "240.9", "60.3", "180.6", "205.0", "53.4", "151.7", "35.9", "6.9", "29.0"],
["Larceny-theft . . . . . . . .. .. .. .. .. .", "1,080.1", "258.1", "822.0", "608.8", "140.5", "468.3", "471.3", "117.6", "353.6"],
["Motor vehicle theft . . . . .. .. . .... .", "65.6", "16.0", "49.6", "53.9", "13.3", "40.7", "11.7", "2.7", "8.9"],
["Arson .. . . . .. . ... .... .... .... .", "9.8", "4.3", "5.5", "8.1", "3.7", "4.4", "1.7", "0.6", "1.1"],
["Other assaults .. . . . . .. . ... . ... ..", "1,061.3", "175.3", "886.1", "785.4", "115.4", "670.0", "276.0", "59.9", "216.1"],
["Forgery and counterfeiting .. . . . . . ..", "68.9", "1.7", "67.2", "42.9", "1.2", "41.7", "26.0", "0.5", "25.5"],
["Fraud .... .. . . .. ... .... .... ....", "173.7", "5.1", "168.5", "98.4", "3.3", "95.0", "75.3", "1.8", "73.5"],
["Embezzlement . . .. . . . .. . ... . ....", "14.6", "", "14.1", "7.2", "", "6.9", "7.4", "", "7.2"],
["Stolen property 1 . . . . . . .. . .. .. ...", "84.3", "15.1", "69.2", "66.7", "12.2", "54.5", "17.6", "2.8", "14.7"],
["Vandalism . . . . . . . .. .. .. .. .. ....", "217.4", "72.7", "144.7", "178.1", "62.8", "115.3", "39.3", "9.9", "29.4"],
["Weapons; carrying, possessing, etc. .", "132.9", "27.1", "105.8", "122.1", "24.3", "97.8", "10.8", "2.8", "8.0"],
["Prostitution and commercialized vice",
"56.9", "1.1", "55.8", "17.3", "", "17.1", "39.6", "0.8", "38.7"],
["Sex offenses 2 . . . . .. . . . .. .. .. . ..", "61.5", "10.7", "50.7", "56.1", "9.6", "46.5", "5.4", "1.1", "4.3"],
["Drug abuse violations . . . . . . . .. ...", "1,333.0", "136.6", "1,196.4", "1,084.3", "115.2", "969.1", "248.7", "21.4", "227.3"],
["Gambling .. . . . . .. ... . ... . ... ...", "8.2", "1.4", "6.8", "7.2", "1.4", "5.9", "0.9", "", "0.9"],
["Offenses against the family and", "", "", "", "", "", "", "", "", ""],
["children . . . .. . . .. .. .. .. .. .. . ..", "92.4", "3.7", "88.7", "68.9", "2.4", "66.6", "23.4", "1.3", "22.1"],
["Driving under the influence . . . . . .. .", "1,158.5", "109.2", "1,147.5", "895.8", "8.2", "887.6", "262.7", "2.7", "260.0"],
["Liquor laws . . . . . . . .. .. .. .. .. .. .", "48.2", "90.2", "368.0", "326.8", "55.4", "271.4",
"131.4", "34.7", "96.6"],
["Drunkenness . . .. . . . .. . ... . ... ..", "488.1", "11.4", "476.8", "406.8", "8.5", "398.3", "81.3", "2.9", "78.4"],
["Disorderly conduct . .. . . . . . .. .. .. .", "529.5", "136.1", "393.3", "387.1", "90.8", "296.2", "142.4", "45.3", "97.1"],
["Vagrancy . . . .. . . . ... .... .... ...", "26.6", "2.2", "24.4", "20.9", "1.6", "19.3", "5.7", "0.6", "5.1"],
["All other offenses (except traffic) . . ..", "306.1", "263.4", "2,800.8", "2,337.1", "194.2", "2,142.9", "727.0", "69.2", "657.9"],
["Suspicion . . . .. . . .. .. .. .. .. .. . ..", "1.6", "", "1.4", "1.2", "", "1.0", "", "", ""],
["Curfew and loitering law violations ..", "91.0", "91.0", "(X)", "63.1", "63.1", "(X)", "28.0", "28.0", "(X)"],
["Runaways . . . . . . . .. .. .. .. .. ....", "75.8", "75.8", "(X)", "34.0", "34.0", "(X)", "41.8", "41.8", "(X)"],
["", " Represents zero. X Not applicable. 1 Buying, receiving, possessing stolen property. 2 Except forcible rape and prostitution.", "", "", "", "", "", "", "", ""],
["", "Source: U.S. Department of Justice, Federal Bureau of Investigation, Uniform Crime Reports, Arrests Master Files.", "", "", "", "", "", "", "", ""]
]
data_stream_two_tables_2 = [
["", "Source: U.S. Department of Justice, Federal Bureau of Investigation, Uniform Crime Reports, Arrests Master Files.", "", "", "", ""],
["Table 325. Arrests by Race: 2009", "", "", "", "", ""],
["[Based on Uniform Crime Reporting (UCR) Program. Represents arrests reported (not charged) by 12,371 agencies", "", "", "", "", ""],
["with a total population of 239,839,971 as estimated by the FBI. See headnote, Table 324]", "", "", "", "", ""],
["", "", "", "", "American", ""],
["Offense charged", "", "", "",
"Indian/Alaskan", "Asian Pacific"],
["", "Total", "White", "Black", "Native", "Islander"],
["Total . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .", "10,690,561", "7,389,208", "3,027,153", "150,544", "123,656"],
["Violent crime . . . . . . . . . . . . . . . . . . . . . . . . . . . .", "456,965", "268,346", "177,766", "5,608", "5,245"],
["Murder and nonnegligent manslaughter . .. ... .", "9,739", "4,741", "4,801", "100", "97"],
["Forcible rape . . . . . . . .. .. .. .. .... .. ...... .", "16,362", "10,644", "5,319", "169", "230"],
["Robbery . . . . .. . . . ... . ... . .... .... .... . . .", "100,496", "43,039", "55,742", "726", "989"],
["Aggravated assault . . . . . . . .. .. ...... .. ....", "330,368", "209,922", "111,904", "4,613", "3,929"],
["Property crime . . . . . . . . . . . . . . . . . . . . . . . . . . .", "1,364,409", "922,139", "406,382", "17,599", "18,289"],
["Burglary . . .. . . . .. . .... .... .... .... ... . . .", "234,551", "155,994", "74,419", "2,021", "2,117"],
["Larceny-theft . . . . . . . .. .. .. .. .... .. ...... .", "1,056,473", "719,983", "306,625", "14,646", "15,219"],
["Motor vehicle theft . . . . . .. ... . ... ..... ... ..", "63,919", "39,077", "23,184", "817", "841"],
["Arson .. . . .. .. .. ... .... .... .... .... . . . . .", "9,466", "7,085", "2,154", "115", "112"],
["Other assaults .. . . . . . ... . ... . ... ..... ... ..", "1,032,502", "672,865", "332,435", "15,127", "12,075"],
["Forgery and counterfeiting .. . . . . . ... ..... .. ..", "67,054", "44,730", "21,251", "345", "728"],
["Fraud ... . . . . .. .. .. .. .. .. .. .. .. .... . . . . . .", "161,233", "108,032", "50,367", "1,315", "1,519"],
["Embezzlement . . . .. . . . ... . ... . .... ... .....", "13,960", "9,208", "4,429", "75", "248"],
["Stolen property; buying, receiving, possessing .. .", "82,714", "51,953", "29,357", "662", "742"],
["Vandalism . . . . . . . .. .. .. .. .. .. .... .. ..... .", "212,173", "157,723", "48,746", "3,352", "2,352"],
["Weapons—carrying, possessing, etc. .. .. ... .. .", "130,503", "74,942", "53,441", "951", "1,169"],
["Prostitution and commercialized vice . ... .. .. ..", "56,560", "31,699", "23,021", "427", "1,413"],
["Sex offenses 1 . . . . . . . .. .. .. .. .... .. ...... .", "60,175", "44,240", "14,347", "715", "873"],
["Drug abuse violations . . . . . . . .. . ..... .. .....", "1,301,629", "845,974", "437,623", "8,588", "9,444"],
["Gambling . . . . .. . . . ... . ... . .. ... . ...... .. .", "8,046", "2,290", "5,518", "27", "211"],
["Offenses against the family and children ... .. .. .", "87,232", "58,068", "26,850", "1,690", "624"],
["Driving under the influence . . . . . . .. ... ...... .", "1,105,401", "954,444", "121,594", "14,903", "14,460"],
["Liquor laws . . . . . . . .. .. .. .. .. . ..... .. .....", "444,087", "373,189", "50,431", "14,876", "5,591"],
["Drunkenness . .. . . . . . ... . ... . ..... . .......", "469,958", "387,542", "71,020", "8,552", "2,844"],
["Disorderly conduct . . .. . . . . .. .. . ..... .. .....", "515,689", "326,563", "176,169", "8,783", "4,174"],
["Vagrancy . . .. .. . . .. ... .... .... .... .... . . .", "26,347", "14,581", "11,031", "543", "192"],
["All other offenses (except traffic) . .. .. .. ..... ..", "2,929,217", "1,937,221", "911,670", "43,880", "36,446"],
["Suspicion . . .. . . . .. .. .. .. .. .. .. ...... .. . . .", "1,513", "677", "828", "1", "7"],
["Curfew and loitering law violations . .. ... .. ....", "89,578", "54,439", "33,207", "872", "1,060"],
["Runaways . . . . . . . .. .. .. .. .. .. .... .. ..... .", "73,616", "48,343", "19,670", "1,653", "3,950"],
["1 Except forcible rape and prostitution.", "", "", "", "", ""],
["", "Source: U.S. Department of Justice, Federal Bureau of Investigation, “Crime in the United States, Arrests,” September 2010,", "", "", "", ""]
]
data_stream_table_areas = [
@ -187,14 +273,10 @@ data_stream_split_text = [
["", "", "", "", "1522 WEST LINDSEY", "", "", "", "", ""],
["632575", "BAW", "BASHU LEGENDS", "HYH HE CHUANG LLC", "STREET", "NORMAN", "OK", "73069", "-", "2014/07/21"],
["", "", "", "DEEP FORK HOLDINGS", "", "", "", "", "", ""],
["543149", "BAW", "BEDLAM BAR-B-Q", "LLC", "610 NORTHEAST 50TH", "OKLAHOMA CITY", "OK", "73105", "(405) 528-7427", "2015/02/23"],
["", "", "", "", "Page 1 of 151", "", "", "", "", ""]
["543149", "BAW", "BEDLAM BAR-B-Q", "LLC", "610 NORTHEAST 50TH", "OKLAHOMA CITY", "OK", "73105", "(405) 528-7427", "2015/02/23"]
]
data_stream_flag_size = [
["", "TABLE 125: STATE-WISE COMPOSITION OF OUTSTANDING LIABILITIES - 1997 <s>(Contd.)</s>", "", "", "", "", "", "", "", "", ""],
["", "", "", "", "(As at end-March)", "", "", "", "", "", ""],
["", "", "", "", "", "", "", "", "", "", "(<s>`</s> Billion)"],
["States", "Total", "Market", "NSSF", "WMA", "Loans", "Loans", "Loans", "Loans", "Loans", "Loans"],
["", "Internal", "Loans", "", "from", "from", "from", "from", "from", "from SBI", "from"],
["", "Debt", "", "", "RBI", "Banks", "LIC", "GIC", "NABARD", "& Other", "NCDC"],
@ -230,9 +312,7 @@ data_stream_flag_size = [
["Uttar Pradesh", "80.62", "74.89", "-", "4.34", "1.34", "0.6", "-", "-0.21", "0.18", "0.03"],
["West Bengal", "34.23", "32.19", "-", "-", "2.04", "0.77", "-", "0.06", "-", "0.51"],
["NCT Delhi", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-"],
["ALL STATES", "513.38", "436.02", "-", "25.57", "51.06", "14.18", "-", "8.21", "11.83", "11.08"],
["<s>2</s> Includes `2.45 crore outstanding under “Market Loan Suspense”.", "", "", "", "", "", "", "", "", "", ""],
["", "", "", "", "445", "", "", "", "", "", ""]
["ALL STATES", "513.38", "436.02", "-", "25.57", "51.06", "14.18", "-", "8.21", "11.83", "11.08"]
]
data_lattice = [
@ -261,6 +341,38 @@ data_lattice_table_rotated = [
["Pooled", "38742", "53618", "60601", "86898", "4459", "21918", "27041", "14312", "18519"]
]
data_lattice_two_tables_1 = [
["State", "n", "Literacy Status", "", "", "", "", ""],
["", "", "Illiterate", "Read & Write", "1-4 std.", "5-8 std.", "9-12 std.", "College"],
["Kerala", "2400", "7.2", "0.5", "25.3", "20.1", "41.5", "5.5"],
["Tamil Nadu", "2400", "21.4", "2.3", "8.8", "35.5", "25.8", "6.2"],
["Karnataka", "2399", "37.4", "2.8", "12.5", "18.3", "23.1", "5.8"],
["Andhra Pradesh", "2400", "54.0", "1.7", "8.4", "13.2", "18.8", "3.9"],
["Maharashtra", "2400", "22.0", "0.9", "17.3", "20.3", "32.6", "7.0"],
["Gujarat", "2390", "28.6", "0.1", "14.4", "23.1", "26.9", "6.8"],
["Madhya Pradesh", "2402", "29.1", "3.4", "8.5", "35.1", "13.3", "10.6"],
["Orissa", "2405", "33.2", "1.0", "10.4", "25.7", "21.2", "8.5"],
["West Bengal", "2293", "41.7", "4.4", "13.2", "17.1", "21.2", "2.4"],
["Uttar Pradesh", "2400", "35.3", "2.1", "4.5", "23.3", "27.1", "7.6"],
["Pooled", "23889", "30.9", "1.9", "12.3", "23.2", "25.2", "6.4"]
]
data_lattice_two_tables_2 = [
["State", "n", "Literacy Status", "", "", "", "", ""],
["", "", "Illiterate", "Read & Write", "1-4 std.", "5-8 std.", "9-12 std.", "College"],
["Kerala", "2400", "8.8", "0.3", "20.1", "17.0", "45.6", "8.2"],
["Tamil Nadu", "2400", "29.9", "1.5", "8.5", "33.1", "22.3", "4.8"],
["Karnataka", "2399", "47.9", "2.5", "10.2", "18.8", "18.4", "2.3"],
["Andhra Pradesh", "2400", "66.4", "0.7", "6.8", "12.9", "11.4", "1.8"],
["Maharashtra", "2400", "41.3", "0.6", "14.1", "20.1", "21.6", "2.2"],
["Gujarat", "2390", "57.6", "0.1", "10.3", "16.5", "12.9", "2.7"],
["Madhya Pradesh", "2402", "58.7", "2.2", "6.6", "24.1", "5.3", "3.0"],
["Orissa", "2405", "50.0", "0.9", "8.1", "21.9", "15.1", "4.0"],
["West Bengal", "2293", "49.1", "4.8", "11.2", "16.8", "17.1", "1.1"],
["Uttar Pradesh", "2400", "67.3", "2.0", "3.1", "17.2", "7.7", "2.7"],
["Pooled", "23889", "47.7", "1.5", "9.9", "19.9", "17.8", "3.3"]
]
data_lattice_table_areas = [
["", "", "", "", "", "", "", "", ""],
["State", "n", "Literacy Status", "", "", "", "", "", ""],

View File

@ -56,6 +56,17 @@ def test_stream_table_rotated():
assert df.equals(tables[0].df)
def test_stream_two_tables():
df1 = pd.DataFrame(data_stream_two_tables_1)
df2 = pd.DataFrame(data_stream_two_tables_2)
filename = os.path.join(testdir, "tabula/12s0324.pdf")
tables = camelot.read_pdf(filename, flavor='stream')
assert len(tables) == 2
assert df1.equals(tables[0].df)
assert df2.equals(tables[1].df)
def test_stream_table_areas():
df = pd.DataFrame(data_stream_table_areas)
@ -111,6 +122,17 @@ def test_lattice_table_rotated():
assert df.equals(tables[0].df)
def test_lattice_two_tables():
df1 = pd.DataFrame(data_lattice_two_tables_1)
df2 = pd.DataFrame(data_lattice_two_tables_2)
filename = os.path.join(testdir, "twotables_2.pdf")
tables = camelot.read_pdf(filename)
assert len(tables) == 2
assert df1.equals(tables[0].df)
assert df2.equals(tables[1].df)
def test_lattice_table_areas():
df = pd.DataFrame(data_lattice_table_areas)