Move cell and table to core

pull/2/head
Vinayak Mehta 2018-09-04 03:49:43 +05:30
parent ae64264d3e
commit c689735da2
7 changed files with 144 additions and 135 deletions

View File

@ -1,3 +1,6 @@
__version__ = '1.2.0'
from .core import *
from .__version__ import __version__
__all__ = ['pdf', 'lattice', 'stream']
def read_pdf(filepath, pages='1', grid=True):
    """Placeholder for the package-level ``read_pdf`` entry point.

    Not implemented yet: every argument is accepted but ignored, and
    the call always returns None.

    Parameters
    ----------
    filepath : object
        Currently unused.
    pages : object, optional (default: '1')
        Currently unused.
    grid : bool, optional (default: True)
        Currently unused.

    Returns
    -------
    None
    """
    return None

View File

@ -0,0 +1 @@
__version__ = '0.1.0'

View File

@ -1,128 +0,0 @@
class Cell(object):
    """Cell.

    Defines a cell object with coordinates relative to a left-bottom
    origin, which is also PDFMiner's coordinate space.

    Inherits from ``object`` explicitly so the class is a new-style
    class on Python 2 as well as Python 3.

    Parameters
    ----------
    x1 : float
        x-coordinate of left-bottom point.
    y1 : float
        y-coordinate of left-bottom point.
    x2 : float
        x-coordinate of right-top point.
    y2 : float
        y-coordinate of right-top point.

    Attributes
    ----------
    x1, y1, x2, y2 : float
        The constructor arguments, stored unchanged.
    lb : tuple
        Tuple representing left-bottom coordinates.
    lt : tuple
        Tuple representing left-top coordinates.
    rb : tuple
        Tuple representing right-bottom coordinates.
    rt : tuple
        Tuple representing right-top coordinates.
    bbox : tuple
        Tuple representing the cell's bounding box using the
        left-bottom and right-top coordinates.
    left : bool
        Whether or not cell is bounded on the left.
    right : bool
        Whether or not cell is bounded on the right.
    top : bool
        Whether or not cell is bounded on the top.
    bottom : bool
        Whether or not cell is bounded on the bottom.
    text_objects : list
        List of text objects assigned to cell.
    text : string
        Text assigned to cell.
    spanning_h : bool
        Whether or not cell spans/extends horizontally.
    spanning_v : bool
        Whether or not cell spans/extends vertically.
    image : object
        Initialised to None; not used by any method of this class.
    bounded_edges : int
        Edge count cached by the last call to `get_bounded_edges`
        (0 until that method is called).

    """

    def __init__(self, x1, y1, x2, y2):
        self.x1 = x1
        self.y1 = y1
        self.x2 = x2
        self.y2 = y2
        self.lb = (x1, y1)
        self.lt = (x1, y2)
        self.rb = (x2, y1)
        self.rt = (x2, y2)
        self.bbox = (x1, y1, x2, y2)
        self.left = False
        self.right = False
        self.top = False
        self.bottom = False
        self.text_objects = []
        self.text = ''
        self.spanning_h = False
        self.spanning_v = False
        self.image = None
        # Defined up front so the attribute exists before the first
        # call to get_bounded_edges(); no edge is bounded yet, hence 0.
        self.bounded_edges = 0

    def __repr__(self):
        return '<Cell x1={} y1={} x2={} y2={}>'.format(
            self.x1, self.y1, self.x2, self.y2)

    def add_text(self, text):
        """Adds text to cell.

        Parameters
        ----------
        text : string
            Appended to the text already assigned to the cell.

        """
        self.text = ''.join([self.text, text])

    def get_text(self):
        """Returns text assigned to cell.

        Returns
        -------
        text : string

        """
        return self.text

    def add_object(self, t_object):
        """Adds PDFMiner text object to cell.

        Parameters
        ----------
        t_object : object

        """
        self.text_objects.append(t_object)

    def get_objects(self):
        """Returns list of text objects assigned to cell.

        Returns
        -------
        text_objects : list
            The cell's own list (not a copy).

        """
        return self.text_objects

    def get_bounded_edges(self):
        """Returns the number of edges by which a cell is bounded.

        The count is also cached on ``self.bounded_edges``.

        Returns
        -------
        bounded_edges : int

        """
        # Booleans sum as 0/1, giving the number of bounded sides.
        self.bounded_edges = self.top + self.bottom + self.left + self.right
        return self.bounded_edges

0
camelot/cli.py 100644
View File

View File

@ -1,9 +1,138 @@
import numpy as np
from .cell import Cell
import pandas as pd
class Table:
class Cell(object):
    """Cell.

    Defines a cell object with coordinates relative to a left-bottom
    origin, which is also PDFMiner's coordinate space.

    Parameters
    ----------
    x1 : float
        x-coordinate of left-bottom point.
    y1 : float
        y-coordinate of left-bottom point.
    x2 : float
        x-coordinate of right-top point.
    y2 : float
        y-coordinate of right-top point.

    Attributes
    ----------
    x1, y1, x2, y2 : float
        The constructor arguments, stored unchanged.
    lb : tuple
        Tuple representing left-bottom coordinates.
    lt : tuple
        Tuple representing left-top coordinates.
    rb : tuple
        Tuple representing right-bottom coordinates.
    rt : tuple
        Tuple representing right-top coordinates.
    bbox : tuple
        Tuple representing the cell's bounding box using the
        left-bottom and right-top coordinates.
    left : bool
        Whether or not cell is bounded on the left.
    right : bool
        Whether or not cell is bounded on the right.
    top : bool
        Whether or not cell is bounded on the top.
    bottom : bool
        Whether or not cell is bounded on the bottom.
    text_objects : list
        List of text objects assigned to cell.
    text : string
        Text assigned to cell.
    spanning_h : bool
        Whether or not cell spans/extends horizontally.
    spanning_v : bool
        Whether or not cell spans/extends vertically.
    image : object
        Initialised to None; not used by any method of this class.
    bounded_edges : int
        Edge count cached by the last call to `get_bounded_edges`
        (0 until that method is called).

    """

    def __init__(self, x1, y1, x2, y2):
        # Raw corner coordinates.
        self.x1 = x1
        self.y1 = y1
        self.x2 = x2
        self.y2 = y2

        # Derived corner points and bounding box.
        self.lb = (x1, y1)
        self.lt = (x1, y2)
        self.rb = (x2, y1)
        self.rt = (x2, y2)
        self.bbox = (x1, y1, x2, y2)

        # Edge flags; a new cell starts with no bounded side.
        self.left = False
        self.right = False
        self.top = False
        self.bottom = False

        # Content assigned to the cell.
        self.text_objects = []
        self.text = ''

        self.spanning_h = False
        self.spanning_v = False
        self.image = None

        # Defined up front so the attribute exists before the first
        # call to get_bounded_edges(); no edge is bounded yet, hence 0.
        self.bounded_edges = 0

    def __repr__(self):
        return '<Cell x1={} y1={} x2={} y2={}>'.format(
            self.x1, self.y1, self.x2, self.y2)

    def add_text(self, text):
        """Adds text to cell.

        Parameters
        ----------
        text : string
            Appended to the text already assigned to the cell.

        """
        self.text = ''.join([self.text, text])

    def get_text(self):
        """Returns text assigned to cell.

        Returns
        -------
        text : string

        """
        return self.text

    def add_object(self, t_object):
        """Adds PDFMiner text object to cell.

        Parameters
        ----------
        t_object : object

        """
        self.text_objects.append(t_object)

    def get_objects(self):
        """Returns list of text objects assigned to cell.

        Returns
        -------
        text_objects : list
            The cell's own list (not a copy).

        """
        return self.text_objects

    def get_bounded_edges(self):
        """Returns the number of edges by which a cell is bounded.

        The count is also cached on ``self.bounded_edges``.

        Returns
        -------
        bounded_edges : int

        """
        # Booleans sum as 0/1, giving the number of bounded sides.
        self.bounded_edges = self.top + self.bottom + self.left + self.right
        return self.bounded_edges
class Table(object):
"""Table.
Defines a table object with coordinates relative to a left-bottom
origin, which is also PDFMiner's coordinate space.
@ -234,3 +363,7 @@ class Table:
ar.append([self.cells[r][c].get_text().strip()
for c in range(len(self.cols))])
return ar
class TableSet(object):
    """Placeholder class; no attributes or methods are defined yet."""

View File

@ -8,9 +8,9 @@ import copy_reg
import warnings
import subprocess
from .core import Table
from .imgproc import (adaptive_threshold, find_lines, find_table_contours,
find_table_joints)
from .table import Table
from .utils import (scale_to_pdf, scale_to_image, segments_bbox, text_in_bbox,
merge_close_values, get_table_index, get_score, count_empty,
encode_list, get_text_objects, get_page_layout)

View File

@ -8,7 +8,7 @@ import warnings
import numpy as np
from .table import Table
from .core import Table
from .utils import (text_in_bbox, get_table_index, get_score, count_empty,
encode_list, get_text_objects, get_page_layout)