From c689735da2cf6cbf3b0626a1179936ae072d6651 Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Tue, 4 Sep 2018 03:49:43 +0530 Subject: [PATCH] Move cell and table to core --- camelot/__init__.py | 7 +- camelot/__version__.py | 1 + camelot/cell.py | 128 ------------------------------- camelot/cli.py | 0 camelot/{table.py => core.py} | 139 +++++++++++++++++++++++++++++++++- camelot/lattice.py | 2 +- camelot/stream.py | 2 +- 7 files changed, 144 insertions(+), 135 deletions(-) create mode 100644 camelot/__version__.py delete mode 100644 camelot/cell.py create mode 100644 camelot/cli.py rename camelot/{table.py => core.py} (73%) diff --git a/camelot/__init__.py b/camelot/__init__.py index b9b5f18..25dd606 100644 --- a/camelot/__init__.py +++ b/camelot/__init__.py @@ -1,3 +1,6 @@ -__version__ = '1.2.0' +from .core import * +from .__version__ import __version__ -__all__ = ['pdf', 'lattice', 'stream'] + +def read_pdf(filepath, pages='1', grid=True): + pass \ No newline at end of file diff --git a/camelot/__version__.py b/camelot/__version__.py new file mode 100644 index 0000000..b794fd4 --- /dev/null +++ b/camelot/__version__.py @@ -0,0 +1 @@ +__version__ = '0.1.0' diff --git a/camelot/cell.py b/camelot/cell.py deleted file mode 100644 index 8dfe8d3..0000000 --- a/camelot/cell.py +++ /dev/null @@ -1,128 +0,0 @@ -class Cell: - """Cell. - Defines a cell object with coordinates relative to a left-bottom - origin, which is also PDFMiner's coordinate space. - - Parameters - ---------- - x1 : float - x-coordinate of left-bottom point. - - y1 : float - y-coordinate of left-bottom point. - - x2 : float - x-coordinate of right-top point. - - y2 : float - y-coordinate of right-top point. - - Attributes - ---------- - lb : tuple - Tuple representing left-bottom coordinates. - - lt : tuple - Tuple representing left-top coordinates. - - rb : tuple - Tuple representing right-bottom coordinates. - - rt : tuple - Tuple representing right-top coordinates. 
- - bbox : tuple - Tuple representing the cell's bounding box using the - lower-bottom and right-top coordinates. - - left : bool - Whether or not cell is bounded on the left. - - right : bool - Whether or not cell is bounded on the right. - - top : bool - Whether or not cell is bounded on the top. - - bottom : bool - Whether or not cell is bounded on the bottom. - - text_objects : list - List of text objects assigned to cell. - - text : string - Text assigned to cell. - - spanning_h : bool - Whether or not cell spans/extends horizontally. - - spanning_v : bool - Whether or not cell spans/extends vertically. - """ - - def __init__(self, x1, y1, x2, y2): - - self.x1 = x1 - self.y1 = y1 - self.x2 = x2 - self.y2 = y2 - self.lb = (x1, y1) - self.lt = (x1, y2) - self.rb = (x2, y1) - self.rt = (x2, y2) - self.bbox = (x1, y1, x2, y2) - self.left = False - self.right = False - self.top = False - self.bottom = False - self.text_objects = [] - self.text = '' - self.spanning_h = False - self.spanning_v = False - self.image = None - - def add_text(self, text): - """Adds text to cell. - - Parameters - ---------- - text : string - """ - self.text = ''.join([self.text, text]) - - def get_text(self): - """Returns text assigned to cell. - - Returns - ------- - text : string - """ - return self.text - - def add_object(self, t_object): - """Adds PDFMiner text object to cell. - - Parameters - ---------- - t_object : object - """ - self.text_objects.append(t_object) - - def get_objects(self): - """Returns list of text objects assigned to cell. - - Returns - ------- - text_objects : list - """ - return self.text_objects - - def get_bounded_edges(self): - """Returns the number of edges by which a cell is bounded. 
- - Returns - ------- - bounded_edges : int - """ - self.bounded_edges = self.top + self.bottom + self.left + self.right - return self.bounded_edges diff --git a/camelot/cli.py b/camelot/cli.py new file mode 100644 index 0000000..e69de29 diff --git a/camelot/table.py b/camelot/core.py similarity index 73% rename from camelot/table.py rename to camelot/core.py index fc1a45e..0934961 100644 --- a/camelot/table.py +++ b/camelot/core.py @@ -1,9 +1,138 @@ import numpy as np - -from .cell import Cell +import pandas as pd -class Table: +class Cell(object): + """Cell. + Defines a cell object with coordinates relative to a left-bottom + origin, which is also PDFMiner's coordinate space. + + Parameters + ---------- + x1 : float + x-coordinate of left-bottom point. + + y1 : float + y-coordinate of left-bottom point. + + x2 : float + x-coordinate of right-top point. + + y2 : float + y-coordinate of right-top point. + + Attributes + ---------- + lb : tuple + Tuple representing left-bottom coordinates. + + lt : tuple + Tuple representing left-top coordinates. + + rb : tuple + Tuple representing right-bottom coordinates. + + rt : tuple + Tuple representing right-top coordinates. + + bbox : tuple + Tuple representing the cell's bounding box using the + left-bottom and right-top coordinates. + + left : bool + Whether or not cell is bounded on the left. + + right : bool + Whether or not cell is bounded on the right. + + top : bool + Whether or not cell is bounded on the top. + + bottom : bool + Whether or not cell is bounded on the bottom. + + text_objects : list + List of text objects assigned to cell. + + text : string + Text assigned to cell. + + spanning_h : bool + Whether or not cell spans/extends horizontally. + + spanning_v : bool + Whether or not cell spans/extends vertically. 
+ """ + + def __init__(self, x1, y1, x2, y2): + + self.x1 = x1 + self.y1 = y1 + self.x2 = x2 + self.y2 = y2 + self.lb = (x1, y1) + self.lt = (x1, y2) + self.rb = (x2, y1) + self.rt = (x2, y2) + self.bbox = (x1, y1, x2, y2) + self.left = False + self.right = False + self.top = False + self.bottom = False + self.text_objects = [] + self.text = '' + self.spanning_h = False + self.spanning_v = False + self.image = None + + def add_text(self, text): + """Adds text to cell. + + Parameters + ---------- + text : string + """ + self.text = ''.join([self.text, text]) + + def get_text(self): + """Returns text assigned to cell. + + Returns + ------- + text : string + """ + return self.text + + def add_object(self, t_object): + """Adds PDFMiner text object to cell. + + Parameters + ---------- + t_object : object + """ + self.text_objects.append(t_object) + + def get_objects(self): + """Returns list of text objects assigned to cell. + + Returns + ------- + text_objects : list + """ + return self.text_objects + + def get_bounded_edges(self): + """Returns the number of edges by which a cell is bounded. + + Returns + ------- + bounded_edges : int + """ + self.bounded_edges = self.top + self.bottom + self.left + self.right + return self.bounded_edges + + +class Table(object): """Table. Defines a table object with coordinates relative to a left-bottom origin, which is also PDFMiner's coordinate space. 
@@ -234,3 +363,7 @@ class Table: ar.append([self.cells[r][c].get_text().strip() for c in range(len(self.cols))]) return ar + + +class TableSet(object): + pass \ No newline at end of file diff --git a/camelot/lattice.py b/camelot/lattice.py index 40803f6..2dd471d 100644 --- a/camelot/lattice.py +++ b/camelot/lattice.py @@ -8,9 +8,9 @@ import copy_reg import warnings import subprocess +from .core import Table from .imgproc import (adaptive_threshold, find_lines, find_table_contours, find_table_joints) -from .table import Table from .utils import (scale_to_pdf, scale_to_image, segments_bbox, text_in_bbox, merge_close_values, get_table_index, get_score, count_empty, encode_list, get_text_objects, get_page_layout) diff --git a/camelot/stream.py b/camelot/stream.py index e794d6a..ae0ef4a 100644 --- a/camelot/stream.py +++ b/camelot/stream.py @@ -8,7 +8,7 @@ import warnings import numpy as np -from .table import Table +from .core import Table from .utils import (text_in_bbox, get_table_index, get_score, count_empty, encode_list, get_text_objects, get_page_layout)