Add Python 3 compatibility (#109)

* Add python3 compat

* Update .gitignore

* Update .gitignore again

* Remove debugging return

* Add unicode_literals import

* Bump version

* Add python3-tk note
pull/2/head v0.2.0
Vinayak Mehta 2018-09-28 21:58:29 +05:30 committed by GitHub
parent 82463e10b4
commit fc0542bd3c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 105 additions and 132 deletions

1
.gitignore vendored
View File

@ -5,6 +5,7 @@ __pycache__/
build/
dist/
*.egg-info/
.eggs/
.coverage
coverage.xml

View File

@ -1,8 +1,9 @@
language: python
python:
- "2.7"
- "3.6"
before_install:
- sudo apt-get install python-tk ghostscript
- sudo apt-get install python-tk python3-tk ghostscript
install:
- pip install ".[dev]"
script:

View File

@ -43,14 +43,7 @@
There's a [command-line interface](https://camelot-py.readthedocs.io/en/latest/user/cli.html) too!
---
**Note:** Camelot only works with:
- Python 2, with Python 3 support [on the way](https://github.com/socialcopsdev/camelot/issues/81).
- Text-based PDFs and not scanned documents. If you can click-and-drag to select text in your table in a PDF viewer, then your PDF is text-based. Support for image-based PDFs using OCR is [planned](https://github.com/socialcopsdev/camelot/issues/101).
---
**Note:** Camelot only works with text-based PDFs and not scanned documents. If you can click-and-drag to select text in your table in a PDF viewer, then your PDF is text-based.
## Why Camelot?
@ -84,7 +77,7 @@ $ cd camelot
$ pip install .
</pre>
Note: Use a [virtualenv](https://virtualenv.pypa.io/en/stable/) if you don't want to affect your global Python installation.
**Note:** Use a [virtualenv](https://virtualenv.pypa.io/en/stable/) if you don't want to affect your global Python installation.
## Documentation

View File

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
VERSION = (0, 1, 2)
VERSION = (0, 2, 0)
__title__ = 'camelot-py'
__description__ = 'PDF Table Extraction for Humans.'

View File

@ -13,7 +13,7 @@ from .base import BaseParser
from ..core import Table
from ..utils import (scale_image, scale_pdf, segments_in_bbox, text_in_bbox,
merge_close_lines, get_table_index, compute_accuracy,
compute_whitespace, setup_logging, encode_)
compute_whitespace, setup_logging)
from ..image_processing import (adaptive_threshold, find_lines,
find_table_contours, find_table_joints)
@ -177,7 +177,7 @@ class Lattice(BaseParser):
gs_call = [
"-q", "-sDEVICE=png16m", "-o", self.imagename, "-r600", self.filename
]
if "ghostscript" in subprocess.check_output(["gs", "-version"]).lower():
if "ghostscript" in subprocess.check_output(["gs", "-version"]).decode('utf-8').lower():
gs_call.insert(0, "gs")
else:
gs_call.insert(0, "gsc")
@ -284,7 +284,6 @@ class Lattice(BaseParser):
table = Lattice._copy_spanning_text(table, copy_text=self.copy_text)
data = table.data
data = encode_(data)
table.df = pd.DataFrame(data)
table.shape = table.df.shape

View File

@ -10,7 +10,7 @@ import pandas as pd
from .base import BaseParser
from ..core import Table
from ..utils import (text_in_bbox, get_table_index, compute_accuracy,
compute_whitespace, setup_logging, encode_)
compute_whitespace, setup_logging)
logger = setup_logging(__name__)
@ -323,7 +323,6 @@ class Stream(BaseParser):
accuracy = compute_accuracy([[100, pos_errors]])
data = table.data
data = encode_(data)
table.df = pd.DataFrame(data)
table.shape = table.df.shape

View File

@ -560,7 +560,7 @@ def get_table_index(table, t, direction, split_text=False, flag_size=False):
lt_col_overlap.append(abs(left - right) / abs(c[0] - c[1]))
else:
lt_col_overlap.append(-1)
if len(filter(lambda x: x != -1, lt_col_overlap)) == 0:
if len(list(filter(lambda x: x != -1, lt_col_overlap))) == 0:
text = t.get_text().strip('\n')
text_range = (t.x0, t.x1)
col_range = (table.cols[0][0], table.cols[-1][1])
@ -669,22 +669,6 @@ def remove_empty(d):
return d
def encode_(ar):
"""Encodes two-dimensional list into unicode.
Parameters
----------
ar : list
Returns
-------
ar : list
"""
ar = [[r.encode('utf-8') for r in row] for row in ar]
return ar
def get_page_layout(filename, char_margin=1.0, line_margin=0.5, word_margin=0.1,
detect_vertical=True, all_texts=True):
"""Returns a PDFMiner LTPage object and page dimension of a single
@ -709,7 +693,7 @@ def get_page_layout(filename, char_margin=1.0, line_margin=0.5, word_margin=0.1,
Dimension of pdf page in the form (width, height).
"""
with open(filename, 'r') as f:
with open(filename, 'rb') as f:
parser = PDFParser(f)
document = PDFDocument(parser)
if not document.is_extractable:

View File

@ -55,13 +55,7 @@ Release v\ |version|. (:ref:`Installation <install>`)
There's a :ref:`command-line interface <cli>` too!
.. note:: Camelot only works with:
- Python 2, with **Python 3** support `on the way`_.
- Text-based PDFs and not scanned documents. If you can click-and-drag to select text in your table in a PDF viewer, then your PDF is text-based. Support for image-based PDFs using **OCR** is `planned`_.
.. _on the way: https://github.com/socialcopsdev/camelot/issues/81
.. _planned: https://github.com/socialcopsdev/camelot/issues/101
.. note:: Camelot only works with text-based PDFs and not scanned documents. If you can click-and-drag to select text in your table in a PDF viewer, then your PDF is text-based.
Why Camelot?
------------

View File

@ -14,6 +14,8 @@ For Ubuntu::
$ apt install python-tk ghostscript
.. note:: For Python 3, install python3-tk.
For macOS::
$ brew install tcl-tk ghostscript

View File

@ -3,5 +3,5 @@ matplotlib==2.2.3
numpy==1.13.3
opencv-python==3.4.2.17
pandas==0.23.4
pdfminer==20140328
pdfminer.six==20170720
PyPDF2==1.26.0

View File

@ -48,7 +48,8 @@ def setup_package():
# Trove classifiers
# Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers
'License :: OSI Approved :: MIT License',
'Programming Language :: Python :: 2.7'
'Programming Language :: Python :: 2.7',
'Programming Language :: Python :: 3.6'
])
try:

View File

@ -1,7 +1,10 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
data_stream_table_rotated = [
["","","Table 21 Current use of contraception by background characteristics—Continued","","","","","","","","","","","","","","",""],
["", "", "Table 21 Current use of contraception by background characteristics\u2014Continued", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""],
["", "", "", "", "", "", "Modern method", "", "", "", "", "", "", "Traditional method", "", "", "", ""],
["", "", "", "Any", "", "", "", "", "", "", "Other", "Any","", "", "", "Not", "", "Number"],
["", "", "Any", "modern", "Female", "Male", "", "", "", "Condom/", "modern", "traditional", "", "With-", "Folk", "currently", "", "of"],
@ -18,9 +21,10 @@ data_stream_table_rotated = [
["", "Fourth", "73.9", "52.3", "32.0", "0.5", "12.5", "0.6", "0.2", "6.3", "0.2", "21.6", "11.5", "9.9", "0.2", "26.1", "100.0", "908"],
["", "Highest", "78.3", "44.4", "19.5", "1.0", "9.7", "1.4", "0.0", "12.7", "0.0", "33.8", "18.2", "15.6", "0.0", "21.7", "100.0", "733"],
["", "Number of living children", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""],
["","No children","25.1","7.6","0.3","0.5","2.0","0.0","0.0","4.8","0.0","17.5","9.0","8.5","0.0","74.9","100.0","563"],
["", "No children", "25.1", "7.6", "0.3", "0.5", "2.0", "0.0",
"0.0", "4.8", "0.0", "17.5", "9.0", "8.5", "0.0", "74.9", "100.0", "563"],
["", "1 child", "66.5", "32.1", "3.7", "0.7", "20.1", "0.7", "0.1", "6.9", "0.0", "34.3", "18.9", "15.2", "0.3", "33.5", "100.0", "1,190"],
["","1 son","66.8","33.2","4.1","0.7","21.1","0.5","0.3","6.6","0.0","33.5","21.2","12.3","0.0","33.2","100.0","672"],
["\x18\x18", "1 son", "66.8", "33.2", "4.1", "0.7", "21.1", "0.5", "0.3", "6.6", "0.0", "33.5", "21.2", "12.3", "0.0", "33.2", "100.0", "672"],
["", "No sons", "66.1", "30.7", "3.1", "0.6", "18.8", "0.8", "0.0", "7.3", "0.0", "35.4", "15.8", "19.0", "0.6", "33.9", "100.0", "517"],
["", "2 children", "81.6", "60.5", "41.8", "0.9", "11.6", "0.8", "0.3", "4.8", "0.2", "21.1", "12.2", "8.3", "0.6", "18.4", "100.0", "1,576"],
["", "1 or more sons", "83.7", "64.2", "46.4", "0.9", "10.8", "0.8", "0.4", "4.8", "0.1", "19.5", "11.1", "7.6", "0.7", "16.3", "100.0", "1,268"],
@ -31,7 +35,8 @@ data_stream_table_rotated = [
["", "4+ children", "74.3", "58.1", "45.1", "0.6", "8.7", "0.6", "0.7", "2.4", "0.0", "16.1", "9.9", "5.4", "0.8", "25.7", "100.0", "944"],
["", "1 or more sons", "73.9", "58.2", "46.0", "0.7", "8.3", "0.7", "0.7", "1.9", "0.0", "15.7", "9.4", "5.5", "0.8", "26.1", "100.0", "901"],
["", "No sons", "(82.1)", "(57.3)", "(25.6)", "(0.0)", "(17.8)", "(0.0)", "(0.0)", "(13.9)", "(0.0)", "(24.8)", "(21.3)", "(3.5)", "(0.0)", "(17.9)", "100.0", "43"],
["","Total","71.2","49.9","32.2","0.7","11.7","0.6","0.3","4.3","0.1","21.3","12.3","8.4","0.5","28.8","100.0","5,234"],
["", "Total", "71.2", "49.9", "32.2",
"0.7", "11.7", "0.6", "0.3", "4.3", "0.1", "21.3", "12.3", "8.4", "0.5", "28.8", "100.0", "5,234"],
["", "NFHS-2 (1998-99)", "66.6", "47.3", "32.0", "1.8", "9.2", "1.4", "na", "2.9", "na", "na", "8.7", "9.8", "na", "33.4", "100.0", "4,116"],
["", "NFHS-1 (1992-93)", "57.7", "37.6", "26.5", "4.3", "3.6", "1.3", "0.1", "1.9", "na", "na", "11.3", "8.3", "na", "42.3", "100.0", "3,970"],
["", "", "Note: If more than one method is used, only the most effective method is considered in this tabulation. Total includes women for whom caste/tribe was not known or is missing, who are", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""],
@ -42,7 +47,6 @@ data_stream_table_rotated = [
["", "", "", "", "", "", "", "", "54", "", "", "", "", "", "", "", "", ""]
]
data_stream_table_area_single = [
["","One Withholding"],
["Payroll Period","Allowance"],
@ -57,7 +61,6 @@ data_stream_table_area_single = [
["(each day of the payroll period)",""]
]
data_stream_columns = [
["Clave", "Nombre Entidad", "Clave", "Nombre Municipio", "Clave", "Nombre Localidad"],
["Entidad", "", "Municipio", "", "Localidad", ""],
@ -67,15 +70,15 @@ data_stream_columns = [
["01", "Aguascalientes", "001", "Aguascalientes", "0102", "Los Arbolitos [Rancho]"],
["01", "Aguascalientes", "001", "Aguascalientes", "0104", "Ardillas de Abajo (Las Ardillas)"],
["01", "Aguascalientes", "001", "Aguascalientes", "0106", "Arellano"],
["01","Aguascalientes","001","Aguascalientes","0112","Bajío los Vázquez"],
["01","Aguascalientes","001","Aguascalientes","0113","Bajío de Montoro"],
["01","Aguascalientes","001","Aguascalientes","0114","Residencial San Nicolás [Baños la Cantera]"],
["01","Aguascalientes","001","Aguascalientes","0120","Buenavista de Peñuelas"],
["01","Aguascalientes","001","Aguascalientes","0121","Cabecita 3 Marías (Rancho Nuevo)"],
["01","Aguascalientes","001","Aguascalientes","0125","Cañada Grande de Cotorina"],
["01","Aguascalientes","001","Aguascalientes","0126","Cañada Honda [Estación]"],
["01","Aguascalientes","001","Aguascalientes","0127","Los Caños"],
["01","Aguascalientes","001","Aguascalientes","0128","El Cariñán"],
["01", "Aguascalientes", "001", "Aguascalientes", "0112", "Baj\xedo los V\xe1zquez"],
["01", "Aguascalientes", "001", "Aguascalientes", "0113", "Baj\xedo de Montoro"],
["01", "Aguascalientes", "001", "Aguascalientes", "0114", "Residencial San Nicol\xe1s [Ba\xf1os la Cantera]"],
["01", "Aguascalientes", "001", "Aguascalientes", "0120", "Buenavista de Pe\xf1uelas"],
["01", "Aguascalientes", "001", "Aguascalientes", "0121", "Cabecita 3 Mar\xedas (Rancho Nuevo)"],
["01", "Aguascalientes", "001", "Aguascalientes", "0125", "Ca\xf1ada Grande de Cotorina"],
["01", "Aguascalientes", "001", "Aguascalientes", "0126", "Ca\xf1ada Honda [Estaci\xf3n]"],
["01", "Aguascalientes", "001", "Aguascalientes", "0127", "Los Ca\xf1os"],
["01", "Aguascalientes", "001", "Aguascalientes", "0128", "El Cari\xf1\xe1n"],
["01", "Aguascalientes", "001", "Aguascalientes", "0129", "El Carmen [Granja]"],
["01", "Aguascalientes", "001", "Aguascalientes", "0135", "El Cedazo (Cedazo de San Antonio)"],
["01", "Aguascalientes", "001", "Aguascalientes", "0138", "Centro de Arriba (El Taray)"],
@ -86,25 +89,24 @@ data_stream_columns = [
["01", "Aguascalientes", "001", "Aguascalientes", "0157", "Cotorina de Abajo"],
["01", "Aguascalientes", "001", "Aguascalientes", "0162", "Coyotes"],
["01", "Aguascalientes", "001", "Aguascalientes", "0166", "La Huerta (La Cruz)"],
["01","Aguascalientes","001","Aguascalientes","0170","Cuauhtémoc (Las Palomas)"],
["01", "Aguascalientes", "001", "Aguascalientes", "0170", "Cuauht\xe9moc (Las Palomas)"],
["01", "Aguascalientes", "001", "Aguascalientes", "0171", "Los Cuervos (Los Ojos de Agua)"],
["01","Aguascalientes","001","Aguascalientes","0172","San José [Granja]"],
["01", "Aguascalientes", "001", "Aguascalientes", "0172", "San Jos\xe9 [Granja]"],
["01", "Aguascalientes", "001", "Aguascalientes", "0176", "La Chiripa"],
["01", "Aguascalientes", "001", "Aguascalientes", "0182", "Dolores"],
["01", "Aguascalientes", "001", "Aguascalientes", "0183", "Los Dolores"],
["01", "Aguascalientes", "001", "Aguascalientes", "0190", "El Duraznillo"],
["01","Aguascalientes","001","Aguascalientes","0191","Los Durón"],
["01", "Aguascalientes", "001", "Aguascalientes", "0191", "Los Dur\xf3n"],
["01", "Aguascalientes", "001", "Aguascalientes", "0197", "La Escondida"],
["01", "Aguascalientes", "001", "Aguascalientes", "0201", "Brande Vin [Bodegas]"],
["01", "Aguascalientes", "001", "Aguascalientes", "0207", "Valle Redondo"],
["01", "Aguascalientes", "001", "Aguascalientes", "0209", "La Fortuna"],
["01","Aguascalientes","001","Aguascalientes","0212","Lomas del Gachupín"],
["01","Aguascalientes","001","Aguascalientes","0213","El Carmen (Gallinas Güeras) [Rancho]"],
["01", "Aguascalientes", "001", "Aguascalientes", "0212", "Lomas del Gachup\xedn"],
["01", "Aguascalientes", "001", "Aguascalientes", "0213", "El Carmen (Gallinas G\xfceras) [Rancho]"],
["01", "Aguascalientes", "001", "Aguascalientes", "0216", "La Gloria"],
["01","Aguascalientes","001","Aguascalientes","0226","Hacienda Nueva"],
["01", "Aguascalientes", "001", "Aguascalientes", "0226", "Hacienda Nueva"]
]
data_lattice = [
["Cycle Name","KI (1/km)","Distance (mi)","Percent Fuel Savings","","",""],
["","","","Improved Speed","Decreased Accel","Eliminate Stops","Decreased Idle"],
@ -115,7 +117,6 @@ data_lattice = [
["4171_1","0.07","173.9","58.1%","1.6%","2.1%","0.5%"]
]
data_lattice_table_rotated = [
["State","Nutritional Assessment (No. of individuals)","","","","IYCF Practices (No. of mothers: 2011-12)","Blood Pressure (No. of adults: 2011-12)","","Fasting Blood Sugar (No. of adults:2011-12)",""],
["","1975-79","1988-90","1996-97","2011-12","","Men","Women","Men","Women"],
@ -132,7 +133,6 @@ data_lattice_table_rotated = [
["Pooled","38742","53618","60601","86898","4459","21918","27041","14312","18519"]
]
data_lattice_process_background = [
["State","Date","Halt stations","Halt days","Persons directly reached(in lakh)","Persons trained","Persons counseled","Persons testedfor HIV"],
["Delhi","1.12.2009","8","17","1.29","3,665","2,409","1,000"],
@ -144,7 +144,6 @@ data_lattice_process_background = [
["Total","","47","92","11.81","22,455","19,584","10,644"]
]
data_lattice_copy_text = [
["Plan Type","County","Plan Name","Totals"],
["GMC","Sacramento","Anthem Blue Cross","164,380"],