diff --git a/camelot/core.py b/camelot/core.py index d25a810..87cb6ae 100644 --- a/camelot/core.py +++ b/camelot/core.py @@ -4,6 +4,9 @@ import numpy as np class Cell(object): + """ + + """ def __init__(self, x1, y1, x2, y2): self.x1 = x1 self.y1 = y1 @@ -27,23 +30,56 @@ class Cell(object): pass def add_text(self, text): + """ + + Parameters + ---------- + text + """ self.text = ''.join([self.text, text]) def get_text(self): + """ + + Returns + ------- + + """ return self.text def add_object(self, t_object): + """ + + Parameters + ---------- + t_object + """ self.text_objects.append(t_object) def get_objects(self): + """ + + Returns + ------- + + """ return self.text_objects def get_bounded_edges(self): + """ + + Returns + ------- + + """ self.bounded_edges = self.top + self.bottom + self.left + self.right return self.bounded_edges class Table(object): + """ + + """ def __init__(self, cols, rows): self.cols = cols self.rows = rows @@ -60,6 +96,12 @@ class Table(object): return '<{} shape={}>'.format(self.__class__.__name__, self._shape) def set_all_edges(self): + """ + + Returns + ------- + + """ for r in range(len(self.rows)): for c in range(len(self.cols)): self.cells[r][c].left = True @@ -69,6 +111,12 @@ class Table(object): return self def set_border_edges(self): + """ + + Returns + ------- + + """ for r in range(len(self.rows)): self.cells[r][0].left = True self.cells[r][len(self.cols) - 1].right = True @@ -78,6 +126,18 @@ class Table(object): return self def set_edges(self, vertical, horizontal, jtol=2): + """ + + Parameters + ---------- + vertical + horizontal + jtol + + Returns + ------- + + """ for v in vertical: # find closest x coord # iterate over y coords and find closest points @@ -185,6 +245,12 @@ class Table(object): return self def set_spanning(self): + """ + + Returns + ------- + + """ for r in range(len(self.rows)): for c in range(len(self.cols)): bound = self.cells[r][c].get_bounded_edges() @@ -225,6 +291,12 @@ class Table(object): @property def data(self): + """ + + Returns + ------- + + """ d = [] for r in range(len(self.rows)): d.append([self.cells[r][c].get_text().strip() @@ -233,6 +305,12 @@ class Table(object): @property def df(self): + """ + + Returns + ------- + + """ return self._df @df.setter @@ -241,6 +319,12 @@ class Table(object): @property def shape(self): + """ + + Returns + ------- + + """ return self._shape @shape.setter @@ -249,6 +333,12 @@ class Table(object): @property def accuracy(self): + """ + + Returns + ------- + + """ return self._accuracy @accuracy.setter @@ -257,6 +347,12 @@ class Table(object): @property def whitespace(self): + """ + + Returns + ------- + + """ return self._whitespace @whitespace.setter @@ -265,6 +361,12 @@ class Table(object): @property def order(self): + """ + + Returns + ------- + + """ return self._order @order.setter @@ -273,6 +375,12 @@ class Table(object): @property def page(self): + """ + + Returns + ------- + + """ return self._page @page.setter @@ -281,6 +389,12 @@ class Table(object): @property def parsing_report(self): + """ + + Returns + ------- + + """ # pretty? report = { 'accuracy': self._accuracy, @@ -292,6 +406,9 @@ class Table(object): class TableList(list): + """ + + """ def __init__(self, tables): self._tables = tables @@ -307,6 +424,9 @@ class TableList(list): class Geometry(object): + """ + + """ def __init__(self): self._text = [] self._images = () @@ -315,6 +435,12 @@ class Geometry(object): @property def text(self): + """ + + Returns + ------- + + """ return self._text @text.setter @@ -323,6 +449,12 @@ class Geometry(object): @property def images(self): + """ + + Returns + ------- + + """ return self._images @images.setter @@ -331,6 +463,12 @@ class Geometry(object): @property def segments(self): + """ + + Returns + ------- + + """ return self._segments @segments.setter @@ -339,6 +477,12 @@ class Geometry(object): @property def tables(self): + """ + + Returns + ------- + + """ return self._tables @tables.setter @@ -347,6 +491,9 @@ class Geometry(object): class GeometryList(object): + """ + + """ def __init__(self, geometry): self._text = [g.text for g in geometry] self._images = [g.images for g in geometry] @@ -363,16 +510,40 @@ class GeometryList(object): @property def text(self): + """ + + Returns + ------- + + """ return self._text @property def images(self): + """ + + Returns + ------- + + """ return self._images @property def segments(self): + """ + + Returns + ------- + + """ return self._segments @property def tables(self): + """ + + Returns + ------- + + """ return self._tables \ No newline at end of file diff --git a/camelot/handlers.py b/camelot/handlers.py index dce3b05..c4bcfd8 100644 --- a/camelot/handlers.py +++ b/camelot/handlers.py @@ -9,6 +9,9 @@ from .utils import get_page_layout, get_text_objects, get_rotation class PDFHandler(object): + """ + + """ def __init__(self, filename, pages='1'): self.filename = filename if not self.filename.endswith('.pdf'): @@ -71,6 +74,17 @@ class PDFHandler(object): outfile.write(f) def parse(self, mesh=False, **kwargs): + """ + + Parameters + ---------- + mesh + kwargs + + Returns + ------- + + """ for p in self.pages: self.__save_page(self.filename, p, self.temp) pages = [os.path.join(self.temp, 'page-{0}.pdf'.format(p)) diff --git a/camelot/image_processing.py b/camelot/image_processing.py index fc284e4..a1526ef 100644 --- a/camelot/image_processing.py +++ b/camelot/image_processing.py @@ -8,6 +8,19 @@ from .utils import merge_tuples def adaptive_threshold(imagename, invert=False, blocksize=15, c=-2): + """ + + Parameters + ---------- + imagename + invert + blocksize + c + + Returns + ------- + + """ img = cv2.imread(imagename) gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) @@ -21,6 +34,19 @@ def adaptive_threshold(imagename, invert=False, blocksize=15, c=-2): def find_lines(threshold, direction='horizontal', scale=15, iterations=0): + """ + + Parameters + ---------- + threshold + direction + scale + iterations + + Returns + ------- + + """ lines = [] if direction == 'vertical': @@ -57,6 +83,17 @@ def find_lines(threshold, direction='horizontal', scale=15, iterations=0): def find_table_contours(vertical, horizontal): + """ + + Parameters + ---------- + vertical + horizontal + + Returns + ------- + + """ mask = vertical + horizontal try: @@ -76,6 +113,18 @@ def find_table_contours(vertical, horizontal): def find_table_joints(contours, vertical, horizontal): + """ + + Parameters + ---------- + contours + vertical + horizontal + + Returns + ------- + + """ joints = np.bitwise_and(vertical, horizontal) tables = {} for c in contours: @@ -100,6 +149,17 @@ def find_table_joints(contours, vertical, horizontal): def remove_lines(threshold, line_scale=15): + """ + + Parameters + ---------- + threshold + line_scale + + Returns + ------- + + """ size = threshold.shape[0] // line_scale vertical_erode_el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size)) horizontal_erode_el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1)) @@ -117,6 +177,17 @@ def remove_lines(threshold, line_scale=15): def find_cuts(threshold, char_scale=200): + """ + + Parameters + ---------- + threshold + char_scale + + Returns + ------- + + """ size = threshold.shape[0] // char_scale char_el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size)) diff --git a/camelot/io.py b/camelot/io.py index c2b319b..54f8ef5 100644 --- a/camelot/io.py +++ b/camelot/io.py @@ -2,6 +2,19 @@ from .handlers import PDFHandler def read_pdf(filepath, pages='1', mesh=False, **kwargs): + """ + + Parameters + ---------- + filepath + pages + mesh + kwargs + + Returns + ------- + + """ # explicit type conversion p = PDFHandler(filepath, pages) tables, __ = p.parse(mesh=mesh, **kwargs) diff --git a/camelot/parsers.py b/camelot/parsers.py index 50bd8a5..c9b5d2a 100644 --- a/camelot/parsers.py +++ b/camelot/parsers.py @@ -32,6 +32,9 @@ copy_reg.pickle(types.MethodType, _reduce_method) class Stream: + """ + + """ def __init__(self, table_area=None, columns=None, ytol=[2], mtol=[0], margins=(1.0, 0.5, 0.1), split_text=False, flag_size=True, debug=False): @@ -134,6 +137,16 @@ class Stream: return cols def extract_tables(self, pdfname): + """ + + Parameters + ---------- + pdfname + + Returns + ------- + + """ layout, dim = get_page_layout(pdfname, char_margin=self.char_margin, line_margin=self.line_margin, word_margin=self.word_margin) lttextlh = get_text_objects(layout, ltype="lh") @@ -265,6 +278,9 @@ class Stream: class Lattice: + """ + + """ def __init__(self, table_area=None, fill=None, mtol=[2], jtol=[2], blocksize=15, threshold_constant=-2, scale=15, iterations=0, invert=False, margins=(1.0, 0.5, 0.1), split_text=False, @@ -328,6 +344,16 @@ class Lattice: return t def extract_tables(self, pdfname): + """ + + Parameters + ---------- + pdfname + + Returns + ------- + + """ layout, dim = get_page_layout(pdfname, char_margin=self.char_margin, line_margin=self.line_margin, word_margin=self.word_margin) lttextlh = get_text_objects(layout, ltype="lh") diff --git a/camelot/plot.py b/camelot/plot.py index 0b37ed9..793764e 100644 --- a/camelot/plot.py +++ b/camelot/plot.py @@ -6,6 +6,16 @@ from .handlers import PDFHandler def plot_geometry(filepath, pages='1', mesh=False, geometry_type='text', **kwargs): + """ + + Parameters + ---------- + filepath + pages + mesh + geometry_type + kwargs + """ # explicit type conversion p = PDFHandler(filepath, pages) kwargs.update({'debug': geometry_type}) diff --git a/camelot/utils.py b/camelot/utils.py index df82a8a..c6b1705 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -19,16 +19,52 @@ from pdfminer.layout import (LAParams, LTAnno, LTChar, LTTextLineHorizontal, def translate(x1, x2): + """ + + Parameters + ---------- + x1 + x2 + + Returns + ------- + + """ x2 += x1 return x2 def scale(x, s): + """ + + Parameters + ---------- + x + s + + Returns + ------- + + """ x *= s return x def rotate(x1, y1, x2, y2, angle): + """ + + Parameters + ---------- + x1 + y1 + x2 + y2 + angle + + Returns + ------- + + """ s = np.sin(angle) c = np.cos(angle) x2 = translate(-x1, x2) @@ -41,6 +77,17 @@ def rotate(x1, y1, x2, y2, angle): def scale_to_image(k, factors): + """ + + Parameters + ---------- + k + factors + + Returns + ------- + + """ x1, y1, x2, y2 = k scaling_factor_x, scaling_factor_y, pdf_y = factors x1 = scale(x1, scaling_factor_x) @@ -52,6 +99,19 @@ def scale_to_image(k, factors): def scale_to_pdf(tables, v_segments, h_segments, factors): + """ + + Parameters + ---------- + tables + v_segments + h_segments + factors + + Returns + ------- + + """ scaling_factor_x, scaling_factor_y, img_y = factors tables_new = {} for k in tables.keys(): @@ -84,6 +144,16 @@ def scale_to_pdf(tables, v_segments, h_segments, factors): def setup_logging(log_filepath): + """ + + Parameters + ---------- + log_filepath + + Returns + ------- + + """ logger = logging.getLogger("app_logger") logger.setLevel(logging.DEBUG) # Log File Handler (Associating one log file per webservice run) @@ -105,6 +175,18 @@ def setup_logging(log_filepath): def get_rotation(lttextlh, lttextlv, ltchar): + """ + + Parameters + ---------- + lttextlh + lttextlv + ltchar + + Returns + ------- + + """ rotation = '' hlen = len([t for t in lttextlh if t.get_text().strip()]) vlen = len([t for t in lttextlv if t.get_text().strip()]) @@ -116,6 +198,18 @@ def get_rotation(lttextlh, lttextlv, ltchar): def segments_bbox(bbox, v_segments, h_segments): + """ + + Parameters + ---------- + bbox + v_segments + h_segments + + Returns + ------- + + """ lb = (bbox[0], bbox[1]) rt = (bbox[2], bbox[3]) v_s = [v for v in v_segments if v[1] > lb[1] - 2 and @@ -126,6 +220,17 @@ def segments_bbox(bbox, v_segments, h_segments): def text_in_bbox(bbox, text): + """ + + Parameters + ---------- + bbox + text + + Returns + ------- + + """ lb = (bbox[0], bbox[1]) rt = (bbox[2], bbox[3]) t_bbox = [t for t in text if lb[0] - 2 <= (t.x0 + t.x1) / 2.0 @@ -135,6 +240,17 @@ def text_in_bbox(bbox, text): def remove_close_values(ar, mtol=2): + """ + + Parameters + ---------- + ar + mtol + + Returns + ------- + + """ ret = [] for a in ar: if not ret: @@ -149,6 +265,17 @@ def remove_close_values(ar, mtol=2): def merge_close_values(ar, mtol=2): + """ + + Parameters + ---------- + ar + mtol + + Returns + ------- + + """ ret = [] for a in ar: if not ret: @@ -164,6 +291,17 @@ def merge_close_values(ar, mtol=2): def flag_on_size(textline, direction): + """ + + Parameters + ---------- + textline + direction + + Returns + ------- + + """ if direction == 'horizontal': d = [(t.get_text(), np.round(t.height, decimals=6)) for t in textline if not isinstance(t, LTAnno)] elif direction == 'vertical': @@ -190,6 +328,19 @@ def flag_on_size(textline, direction): def split_textline(table, textline, direction, flag_size=True): + """ + + Parameters + ---------- + table + textline + direction + flag_size + + Returns + ------- + + """ idx = 0 cut_text = [] bbox = textline.bbox @@ -241,6 +392,20 @@ def split_textline(table, textline, direction, flag_size=True): def get_table_index(table, t, direction, split_text=False, flag_size=True): + """ + + Parameters + ---------- + table + t + direction + split_text + flag_size + + Returns + ------- + + """ r_idx, c_idx = [-1] * 2 for r in range(len(table.rows)): if ((t.y0 + t.y1) / 2.0 < table.rows[r][0] and @@ -284,6 +449,16 @@ def get_table_index(table, t, direction, split_text=False, flag_size=True): def compute_accuracy(error_weights): + """ + + Parameters + ---------- + error_weights + + Returns + ------- + + """ SCORE_VAL = 100 try: score = 0 @@ -299,6 +474,16 @@ def compute_accuracy(error_weights): def remove_empty(d): + """ + + Parameters + ---------- + d + + Returns + ------- + + """ for i, row in enumerate(d): if row == [''] * len(row): d.pop(i) @@ -309,6 +494,16 @@ def remove_empty(d): def count_empty(d): + """ + + Parameters + ---------- + d + + Returns + ------- + + """ empty_p = 0 r_nempty_cells, c_nempty_cells = [], [] for i in d: @@ -334,11 +529,33 @@ def count_empty(d): def encode_(ar): + """ + + Parameters + ---------- + ar + + Returns + ------- + + """ ar = [[r.encode('utf-8') for r in row] for row in ar] return ar def get_text_objects(layout, ltype="char", t=None): + """ + + Parameters + ---------- + layout + ltype + t + + Returns + ------- + + """ if ltype == "char": LTObject = LTChar elif ltype == "lh": @@ -360,6 +577,21 @@ def get_text_objects(layout, ltype="char", t=None): def get_page_layout(pname, char_margin=1.0, line_margin=0.5, word_margin=0.1, detect_vertical=True, all_texts=True): + """ + + Parameters + ---------- + pname + char_margin + line_margin + word_margin + detect_vertical + all_texts + + Returns + ------- + + """ with open(pname, 'r') as f: parser = PDFParser(f) document = PDFDocument(parser) @@ -383,6 +615,12 @@ def get_page_layout(pname, char_margin=1.0, line_margin=0.5, word_margin=0.1, def merge_tuples(tuples): + """ + + Parameters + ---------- + tuples + """ merged = list(tuples[0]) for s, e in tuples: if s <= merged[1]: