[MRG + 1] Make pep8 (#125)

* Make setup.py pep8

Add new line at end of file, fix bare except, remove unused import.

* Make tests/*.py pep8

Add some newlines at end of files and a visual indent.

* Make docs/*.py pep8

Fix block comments and add new lines at end of files.

* Make camelot/*.py pep8

Fixed unused import, a few weirdly ordered imports, a docstring typo and many new lines at the end of files.

* Fix imports

Fix import order and remove a couple more unused imports.

* Fix indents

Fix indentation (no opening delimiter alignment).

* Add newlines
pull/2/head
Oshawk 2018-10-05 12:25:43 +01:00 committed by Vinayak Mehta
parent 6e8079df84
commit 90aaba6eec
20 changed files with 107 additions and 111 deletions

View File

@ -2,6 +2,9 @@
import logging import logging
from .__version__ import __version__
from .io import read_pdf
# set up logging # set up logging
logger = logging.getLogger('camelot') logger = logging.getLogger('camelot')
@ -12,8 +15,3 @@ handler = logging.StreamHandler()
handler.setFormatter(formatter) handler.setFormatter(formatter)
logger.addHandler(handler) logger.addHandler(handler)
from .__version__ import __version__
from .io import read_pdf

View File

@ -8,4 +8,4 @@ __url__ = 'http://camelot-py.readthedocs.io/'
__version__ = '.'.join(map(str, VERSION)) __version__ = '.'.join(map(str, VERSION))
__author__ = 'Vinayak Mehta' __author__ = 'Vinayak Mehta'
__author_email__ = 'vmehta94@gmail.com' __author_email__ = 'vmehta94@gmail.com'
__license__ = 'MIT License' __license__ = 'MIT License'

View File

@ -2,17 +2,18 @@
import logging import logging
logger = logging.getLogger('camelot')
logger.setLevel(logging.INFO)
import click import click
from . import __version__ from . import __version__
from .io import read_pdf from .io import read_pdf
logger = logging.getLogger('camelot')
logger.setLevel(logging.INFO)
class Config(object): class Config(object):
def __init__(self): def __init__(self):
self.config = {} self.config = {}
def set_config(self, key, value): def set_config(self, key, value):
@ -152,4 +153,4 @@ def stream(c, *args, **kwargs):
raise click.UsageError('Please specify output file path using --output') raise click.UsageError('Please specify output file path using --output')
if f is None: if f is None:
raise click.UsageError('Please specify output file format using --format') raise click.UsageError('Please specify output file format using --format')
tables.export(output, f=f, compress=compress) tables.export(output, f=f, compress=compress)

View File

@ -1,7 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import os import os
import json
import zipfile import zipfile
import tempfile import tempfile
@ -519,4 +518,4 @@ class TableList(object):
if compress: if compress:
zipname = os.path.join(os.path.dirname(path), root) + '.zip' zipname = os.path.join(os.path.dirname(path), root) + '.zip'
with zipfile.ZipFile(zipname, 'w', allowZip64=True) as z: with zipfile.ZipFile(zipname, 'w', allowZip64=True) as z:
z.write(filepath, os.path.basename(filepath)) z.write(filepath, os.path.basename(filepath))

View File

@ -145,4 +145,4 @@ class PDFHandler(object):
for p in pages: for p in pages:
t = parser.extract_tables(p) t = parser.extract_tables(p)
tables.extend(t) tables.extend(t)
return TableList(tables) return TableList(tables)

View File

@ -1,8 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from __future__ import division from __future__ import division
from itertools import groupby
from operator import itemgetter
import cv2 import cv2
import numpy as np import numpy as np
@ -40,10 +38,12 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
if process_background: if process_background:
threshold = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, threshold = cv2.adaptiveThreshold(
gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
cv2.THRESH_BINARY, blocksize, c) cv2.THRESH_BINARY, blocksize, c)
else: else:
threshold = cv2.adaptiveThreshold(np.invert(gray), 255, threshold = cv2.adaptiveThreshold(
np.invert(gray), 255,
cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, blocksize, c) cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, blocksize, c)
return img, threshold return img, threshold
@ -197,4 +197,4 @@ def find_table_joints(contours, vertical, horizontal):
joint_coords.append((c1, c2)) joint_coords.append((c1, c2))
tables[(x, y + h, x + w, y)] = joint_coords tables[(x, y + h, x + w, y)] = joint_coords
return tables return tables

View File

@ -89,4 +89,4 @@ def read_pdf(filepath, pages='1', flavor='lattice', **kwargs):
p = PDFHandler(filepath, pages) p = PDFHandler(filepath, pages)
kwargs = remove_extra(kwargs, flavor=flavor) kwargs = remove_extra(kwargs, flavor=flavor)
tables = p.parse(flavor=flavor, **kwargs) tables = p.parse(flavor=flavor, **kwargs)
return tables return tables

View File

@ -1,4 +1,4 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from .stream import Stream from .stream import Stream
from .lattice import Lattice from .lattice import Lattice

View File

@ -18,4 +18,4 @@ class BaseParser(object):
self.horizontal_text = get_text_objects(self.layout, ltype="lh") self.horizontal_text = get_text_objects(self.layout, ltype="lh")
self.vertical_text = get_text_objects(self.layout, ltype="lv") self.vertical_text = get_text_objects(self.layout, ltype="lv")
self.pdf_width, self.pdf_height = self.dimensions self.pdf_width, self.pdf_height = self.dimensions
self.rootname, __ = os.path.splitext(self.filename) self.rootname, __ = os.path.splitext(self.filename)

View File

@ -201,8 +201,9 @@ class Lattice(BaseParser):
if 'ghostscript' in subprocess.check_output(['gs', '-version']).decode('utf-8').lower(): if 'ghostscript' in subprocess.check_output(['gs', '-version']).decode('utf-8').lower():
gs_call.insert(0, 'gs') gs_call.insert(0, 'gs')
else: else:
gs_call.insert(0, 'gsc') gs_call.insert(0, "gsc")
subprocess.call(gs_call, stdout=open(os.devnull, 'w'), subprocess.call(
gs_call, stdout=open(os.devnull, 'w'),
stderr=subprocess.STDOUT) stderr=subprocess.STDOUT)
def _generate_table_bbox(self): def _generate_table_bbox(self):
@ -339,10 +340,10 @@ class Lattice(BaseParser):
_tables = [] _tables = []
# sort tables based on y-coord # sort tables based on y-coord
for table_idx, tk in enumerate(sorted(self.table_bbox.keys(), for table_idx, tk in enumerate(sorted(
key=lambda x: x[1], reverse=True)): self.table_bbox.keys(), key=lambda x: x[1], reverse=True)):
cols, rows, v_s, h_s = self._generate_columns_and_rows(table_idx, tk) cols, rows, v_s, h_s = self._generate_columns_and_rows(table_idx, tk)
table = self._generate_table(table_idx, cols, rows, v_s=v_s, h_s=h_s) table = self._generate_table(table_idx, cols, rows, v_s=v_s, h_s=h_s)
_tables.append(table) _tables.append(table)
return _tables return _tables

View File

@ -116,7 +116,7 @@ class Stream(BaseParser):
row_y = t.y0 row_y = t.y0
temp.append(t) temp.append(t)
rows.append(sorted(temp, key=lambda t: t.x0)) rows.append(sorted(temp, key=lambda t: t.x0))
__ = rows.pop(0) # hacky __ = rows.pop(0) # hacky
return rows return rows
@staticmethod @staticmethod
@ -211,7 +211,7 @@ class Stream(BaseParser):
text = Stream._group_rows(text, row_close_tol=row_close_tol) text = Stream._group_rows(text, row_close_tol=row_close_tol)
elements = [len(r) for r in text] elements = [len(r) for r in text]
new_cols = [(t.x0, t.x1) new_cols = [(t.x0, t.x1)
for r in text if len(r) == max(elements) for t in r] for r in text if len(r) == max(elements) for t in r]
cols.extend(Stream._merge_columns(sorted(new_cols))) cols.extend(Stream._merge_columns(sorted(new_cols)))
return cols return cols
@ -357,10 +357,10 @@ class Stream(BaseParser):
_tables = [] _tables = []
# sort tables based on y-coord # sort tables based on y-coord
for table_idx, tk in enumerate(sorted(self.table_bbox.keys(), for table_idx, tk in enumerate(sorted(
key=lambda x: x[1], reverse=True)): self.table_bbox.keys(), key=lambda x: x[1], reverse=True)):
cols, rows = self._generate_columns_and_rows(table_idx, tk) cols, rows = self._generate_columns_and_rows(table_idx, tk)
table = self._generate_table(table_idx, cols, rows) table = self._generate_table(table_idx, cols, rows)
_tables.append(table) _tables.append(table)
return _tables return _tables

View File

@ -41,16 +41,16 @@ def plot_table(table):
for cell in row: for cell in row:
if cell.left: if cell.left:
plt.plot([cell.lb[0], cell.lt[0]], plt.plot([cell.lb[0], cell.lt[0]],
[cell.lb[1], cell.lt[1]]) [cell.lb[1], cell.lt[1]])
if cell.right: if cell.right:
plt.plot([cell.rb[0], cell.rt[0]], plt.plot([cell.rb[0], cell.rt[0]],
[cell.rb[1], cell.rt[1]]) [cell.rb[1], cell.rt[1]])
if cell.top: if cell.top:
plt.plot([cell.lt[0], cell.rt[0]], plt.plot([cell.lt[0], cell.rt[0]],
[cell.lt[1], cell.rt[1]]) [cell.lt[1], cell.rt[1]])
if cell.bottom: if cell.bottom:
plt.plot([cell.lb[0], cell.rb[0]], plt.plot([cell.lb[0], cell.rb[0]],
[cell.lb[1], cell.rb[1]]) [cell.lb[1], cell.rb[1]])
plt.show() plt.show()
@ -105,4 +105,4 @@ def plot_line(segments):
plt.plot([v[0], v[2]], [v[1], v[3]]) plt.plot([v[0], v[2]], [v[1], v[3]])
for h in horizontal: for h in horizontal:
plt.plot([h[0], h[2]], [h[1], h[3]]) plt.plot([h[0], h[2]], [h[1], h[3]])
plt.show() plt.show()

View File

@ -1,5 +1,4 @@
from __future__ import division from __future__ import division
import os
import shutil import shutil
import tempfile import tempfile
import warnings import warnings
@ -14,7 +13,6 @@ from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.converter import PDFPageAggregator from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import (LAParams, LTAnno, LTChar, LTTextLineHorizontal, from pdfminer.layout import (LAParams, LTAnno, LTChar, LTTextLineHorizontal,
LTTextLineVertical) LTTextLineVertical)
@ -278,8 +276,8 @@ def text_in_bbox(bbox, text):
lb = (bbox[0], bbox[1]) lb = (bbox[0], bbox[1])
rt = (bbox[2], bbox[3]) rt = (bbox[2], bbox[3])
t_bbox = [t for t in text if lb[0] - 2 <= (t.x0 + t.x1) / 2.0 t_bbox = [t for t in text if lb[0] - 2 <= (t.x0 + t.x1) / 2.0
<= rt[0] + 2 and lb[1] - 2 <= (t.y0 + t.y1) / 2.0 <= rt[0] + 2 and lb[1] - 2 <= (t.y0 + t.y1) / 2.0
<= rt[1] + 2] <= rt[1] + 2]
return t_bbox return t_bbox
@ -640,4 +638,4 @@ def get_text_objects(layout, ltype="char", t=None):
t += get_text_objects(obj, ltype=ltype) t += get_text_objects(obj, ltype=ltype)
except AttributeError: except AttributeError:
pass pass
return t return t

View File

@ -10,21 +10,21 @@ class FlaskyStyle(Style):
styles = { styles = {
# No corresponding class for the following: # No corresponding class for the following:
#Text: "", # class: '' # Text: "", # class: ''
Whitespace: "underline #f8f8f8", # class: 'w' Whitespace: "underline #f8f8f8", # class: 'w'
Error: "#a40000 border:#ef2929", # class: 'err' Error: "#a40000 border:#ef2929", # class: 'err'
Other: "#000000", # class 'x' Other: "#000000", # class 'x'
Comment: "italic #8f5902", # class: 'c' Comment: "italic #8f5902", # class: 'c'
Comment.Preproc: "noitalic", # class: 'cp' Comment.Preproc: "noitalic", # class: 'cp'
Keyword: "bold #004461", # class: 'k' Keyword: "bold #004461", # class: 'k'
Keyword.Constant: "bold #004461", # class: 'kc' Keyword.Constant: "bold #004461", # class: 'kc'
Keyword.Declaration: "bold #004461", # class: 'kd' Keyword.Declaration: "bold #004461", # class: 'kd'
Keyword.Namespace: "bold #004461", # class: 'kn' Keyword.Namespace: "bold #004461", # class: 'kn'
Keyword.Pseudo: "bold #004461", # class: 'kp' Keyword.Pseudo: "bold #004461", # class: 'kp'
Keyword.Reserved: "bold #004461", # class: 'kr' Keyword.Reserved: "bold #004461", # class: 'kr'
Keyword.Type: "bold #004461", # class: 'kt' Keyword.Type: "bold #004461", # class: 'kt'
Operator: "#582800", # class: 'o' Operator: "#582800", # class: 'o'
Operator.Word: "bold #004461", # class: 'ow' - like keywords Operator.Word: "bold #004461", # class: 'ow' - like keywords
@ -34,53 +34,53 @@ class FlaskyStyle(Style):
# because special names such as Name.Class, Name.Function, etc. # because special names such as Name.Class, Name.Function, etc.
# are not recognized as such later in the parsing, we choose them # are not recognized as such later in the parsing, we choose them
# to look the same as ordinary variables. # to look the same as ordinary variables.
Name: "#000000", # class: 'n' Name: "#000000", # class: 'n'
Name.Attribute: "#c4a000", # class: 'na' - to be revised Name.Attribute: "#c4a000", # class: 'na' - to be revised
Name.Builtin: "#004461", # class: 'nb' Name.Builtin: "#004461", # class: 'nb'
Name.Builtin.Pseudo: "#3465a4", # class: 'bp' Name.Builtin.Pseudo: "#3465a4", # class: 'bp'
Name.Class: "#000000", # class: 'nc' - to be revised Name.Class: "#000000", # class: 'nc' - to be revised
Name.Constant: "#000000", # class: 'no' - to be revised Name.Constant: "#000000", # class: 'no' - to be revised
Name.Decorator: "#888", # class: 'nd' - to be revised Name.Decorator: "#888", # class: 'nd' - to be revised
Name.Entity: "#ce5c00", # class: 'ni' Name.Entity: "#ce5c00", # class: 'ni'
Name.Exception: "bold #cc0000", # class: 'ne' Name.Exception: "bold #cc0000", # class: 'ne'
Name.Function: "#000000", # class: 'nf' Name.Function: "#000000", # class: 'nf'
Name.Property: "#000000", # class: 'py' Name.Property: "#000000", # class: 'py'
Name.Label: "#f57900", # class: 'nl' Name.Label: "#f57900", # class: 'nl'
Name.Namespace: "#000000", # class: 'nn' - to be revised Name.Namespace: "#000000", # class: 'nn' - to be revised
Name.Other: "#000000", # class: 'nx' Name.Other: "#000000", # class: 'nx'
Name.Tag: "bold #004461", # class: 'nt' - like a keyword Name.Tag: "bold #004461", # class: 'nt' - like a keyword
Name.Variable: "#000000", # class: 'nv' - to be revised Name.Variable: "#000000", # class: 'nv' - to be revised
Name.Variable.Class: "#000000", # class: 'vc' - to be revised Name.Variable.Class: "#000000", # class: 'vc' - to be revised
Name.Variable.Global: "#000000", # class: 'vg' - to be revised Name.Variable.Global: "#000000", # class: 'vg' - to be revised
Name.Variable.Instance: "#000000", # class: 'vi' - to be revised Name.Variable.Instance: "#000000", # class: 'vi' - to be revised
Number: "#990000", # class: 'm' Number: "#990000", # class: 'm'
Literal: "#000000", # class: 'l' Literal: "#000000", # class: 'l'
Literal.Date: "#000000", # class: 'ld' Literal.Date: "#000000", # class: 'ld'
String: "#4e9a06", # class: 's' String: "#4e9a06", # class: 's'
String.Backtick: "#4e9a06", # class: 'sb' String.Backtick: "#4e9a06", # class: 'sb'
String.Char: "#4e9a06", # class: 'sc' String.Char: "#4e9a06", # class: 'sc'
String.Doc: "italic #8f5902", # class: 'sd' - like a comment String.Doc: "italic #8f5902", # class: 'sd' - like a comment
String.Double: "#4e9a06", # class: 's2' String.Double: "#4e9a06", # class: 's2'
String.Escape: "#4e9a06", # class: 'se' String.Escape: "#4e9a06", # class: 'se'
String.Heredoc: "#4e9a06", # class: 'sh' String.Heredoc: "#4e9a06", # class: 'sh'
String.Interpol: "#4e9a06", # class: 'si' String.Interpol: "#4e9a06", # class: 'si'
String.Other: "#4e9a06", # class: 'sx' String.Other: "#4e9a06", # class: 'sx'
String.Regex: "#4e9a06", # class: 'sr' String.Regex: "#4e9a06", # class: 'sr'
String.Single: "#4e9a06", # class: 's1' String.Single: "#4e9a06", # class: 's1'
String.Symbol: "#4e9a06", # class: 'ss' String.Symbol: "#4e9a06", # class: 'ss'
Generic: "#000000", # class: 'g' Generic: "#000000", # class: 'g'
Generic.Deleted: "#a40000", # class: 'gd' Generic.Deleted: "#a40000", # class: 'gd'
Generic.Emph: "italic #000000", # class: 'ge' Generic.Emph: "italic #000000", # class: 'ge'
Generic.Error: "#ef2929", # class: 'gr' Generic.Error: "#ef2929", # class: 'gr'
Generic.Heading: "bold #000080", # class: 'gh' Generic.Heading: "bold #000080", # class: 'gh'
Generic.Inserted: "#00A000", # class: 'gi' Generic.Inserted: "#00A000", # class: 'gi'
Generic.Output: "#888", # class: 'go' Generic.Output: "#888", # class: 'go'
Generic.Prompt: "#745334", # class: 'gp' Generic.Prompt: "#745334", # class: 'gp'
Generic.Strong: "bold #000000", # class: 'gs' Generic.Strong: "bold #000000", # class: 'gs'
Generic.Subheading: "bold #800080", # class: 'gu' Generic.Subheading: "bold #800080", # class: 'gu'
Generic.Traceback: "bold #a40000", # class: 'gt' Generic.Traceback: "bold #a40000", # class: 'gt'
} }

View File

@ -358,4 +358,4 @@ texinfo_documents = [
intersphinx_mapping = { intersphinx_mapping = {
'https://docs.python.org/2': None, 'https://docs.python.org/2': None,
'http://pandas.pydata.org/pandas-docs/stable': None 'http://pandas.pydata.org/pandas-docs/stable': None
} }

View File

@ -2,7 +2,6 @@
import os import os
from setuptools import find_packages from setuptools import find_packages
from pkg_resources import parse_version
here = os.path.abspath(os.path.dirname(__file__)) here = os.path.abspath(os.path.dirname(__file__))
@ -56,11 +55,11 @@ def setup_package():
try: try:
from setuptools import setup from setuptools import setup
except: except ImportError:
from distutils.core import setup from distutils.core import setup
setup(**metadata) setup(**metadata)
if __name__ == '__main__': if __name__ == '__main__':
setup_package() setup_package()

View File

@ -373,4 +373,4 @@ data_lattice_shift_text_right_bottom = [
["Fasting blood glucose", "2400", "Women (≥ 18 yrs)", "5%", "95%", "20%", "1825"], ["Fasting blood glucose", "2400", "Women (≥ 18 yrs)", "5%", "95%", "20%", "1825"],
["", "2400", "Men (≥ 18 yrs)", "-", "-", "-", "1728"], ["", "2400", "Men (≥ 18 yrs)", "-", "-", "-", "1728"],
["Knowledge &Practices on HTN &DM", "2400", "Women (≥ 18 yrs)", "-", "-", "-", "1728"] ["Knowledge &Practices on HTN &DM", "2400", "Women (≥ 18 yrs)", "-", "-", "-", "1728"]
] ]

View File

@ -76,4 +76,4 @@ def test_cli_output_format():
# zip # zip
result = runner.invoke(cli, ['--zip', '--format', 'csv', '--output', outfile.format('csv'), result = runner.invoke(cli, ['--zip', '--format', 'csv', '--output', outfile.format('csv'),
'stream', infile]) 'stream', infile])
assert result.exit_code == 0 assert result.exit_code == 0

View File

@ -82,8 +82,8 @@ def test_stream_flag_size():
def test_lattice(): def test_lattice():
df = pd.DataFrame(data_lattice) df = pd.DataFrame(data_lattice)
filename = os.path.join(testdir, filename = os.path.join(
"tabula/icdar2013-dataset/competition-dataset-us/us-030.pdf") testdir, "tabula/icdar2013-dataset/competition-dataset-us/us-030.pdf")
tables = camelot.read_pdf(filename, pages="2") tables = camelot.read_pdf(filename, pages="2")
assert df.equals(tables[0].df) assert df.equals(tables[0].df)
@ -137,4 +137,4 @@ def test_lattice_shift_text():
assert df_disable.equals(tables[0].df) assert df_disable.equals(tables[0].df)
tables = camelot.read_pdf(filename, line_size_scaling=40, shift_text=['r', 'b']) tables = camelot.read_pdf(filename, line_size_scaling=40, shift_text=['r', 'b'])
assert df_rb.equals(tables[0].df) assert df_rb.equals(tables[0].df)

View File

@ -50,4 +50,4 @@ def test_no_tables_found():
tables = camelot.read_pdf(filename) tables = camelot.read_pdf(filename)
except Exception as e: except Exception as e:
assert type(e).__name__ == 'UserWarning' assert type(e).__name__ == 'UserWarning'
assert str(e) == 'No tables found on page-1' assert str(e) == 'No tables found on page-1'