Move cell and table to core
parent
ae64264d3e
commit
c689735da2
|
|
@ -1,3 +1,6 @@
|
||||||
__version__ = '1.2.0'
|
from .core import *
|
||||||
|
from .__version__ import __version__
|
||||||
|
|
||||||
__all__ = ['pdf', 'lattice', 'stream']
|
|
||||||
|
def read_pdf(filepath, pages='1', grid=True):
    """Placeholder for the package-level PDF reading entry point.

    Not implemented yet: every argument is accepted but ignored, and the
    function returns None.

    Parameters
    ----------
    filepath : str
        Presumably the path of the PDF file to read -- TODO confirm once
        the function is implemented.

    pages : str, optional (default: '1')
        Presumably a page selection -- TODO confirm.

    grid : bool, optional (default: True)
        Meaning not established by this stub -- TODO confirm.

    Returns
    -------
    None
    """
    pass
|
||||||
|
|
@ -0,0 +1 @@
|
||||||
|
__version__ = '0.1.0'
|
||||||
128
camelot/cell.py
128
camelot/cell.py
|
|
@ -1,128 +0,0 @@
|
||||||
class Cell:
|
|
||||||
"""Cell.
|
|
||||||
Defines a cell object with coordinates relative to a left-bottom
|
|
||||||
origin, which is also PDFMiner's coordinate space.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
x1 : float
|
|
||||||
x-coordinate of left-bottom point.
|
|
||||||
|
|
||||||
y1 : float
|
|
||||||
y-coordinate of left-bottom point.
|
|
||||||
|
|
||||||
x2 : float
|
|
||||||
x-coordinate of right-top point.
|
|
||||||
|
|
||||||
y2 : float
|
|
||||||
y-coordinate of right-top point.
|
|
||||||
|
|
||||||
Attributes
|
|
||||||
----------
|
|
||||||
lb : tuple
|
|
||||||
Tuple representing left-bottom coordinates.
|
|
||||||
|
|
||||||
lt : tuple
|
|
||||||
Tuple representing left-top coordinates.
|
|
||||||
|
|
||||||
rb : tuple
|
|
||||||
Tuple representing right-bottom coordinates.
|
|
||||||
|
|
||||||
rt : tuple
|
|
||||||
Tuple representing right-top coordinates.
|
|
||||||
|
|
||||||
bbox : tuple
|
|
||||||
Tuple representing the cell's bounding box using the
|
|
||||||
left-bottom and right-top coordinates.
|
|
||||||
|
|
||||||
left : bool
|
|
||||||
Whether or not cell is bounded on the left.
|
|
||||||
|
|
||||||
right : bool
|
|
||||||
Whether or not cell is bounded on the right.
|
|
||||||
|
|
||||||
top : bool
|
|
||||||
Whether or not cell is bounded on the top.
|
|
||||||
|
|
||||||
bottom : bool
|
|
||||||
Whether or not cell is bounded on the bottom.
|
|
||||||
|
|
||||||
text_objects : list
|
|
||||||
List of text objects assigned to cell.
|
|
||||||
|
|
||||||
text : string
|
|
||||||
Text assigned to cell.
|
|
||||||
|
|
||||||
spanning_h : bool
|
|
||||||
Whether or not cell spans/extends horizontally.
|
|
||||||
|
|
||||||
spanning_v : bool
|
|
||||||
Whether or not cell spans/extends vertically.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, x1, y1, x2, y2):
|
|
||||||
|
|
||||||
self.x1 = x1
|
|
||||||
self.y1 = y1
|
|
||||||
self.x2 = x2
|
|
||||||
self.y2 = y2
|
|
||||||
self.lb = (x1, y1)
|
|
||||||
self.lt = (x1, y2)
|
|
||||||
self.rb = (x2, y1)
|
|
||||||
self.rt = (x2, y2)
|
|
||||||
self.bbox = (x1, y1, x2, y2)
|
|
||||||
self.left = False
|
|
||||||
self.right = False
|
|
||||||
self.top = False
|
|
||||||
self.bottom = False
|
|
||||||
self.text_objects = []
|
|
||||||
self.text = ''
|
|
||||||
self.spanning_h = False
|
|
||||||
self.spanning_v = False
|
|
||||||
self.image = None
|
|
||||||
|
|
||||||
def add_text(self, text):
|
|
||||||
"""Adds text to cell.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
text : string
|
|
||||||
"""
|
|
||||||
self.text = ''.join([self.text, text])
|
|
||||||
|
|
||||||
def get_text(self):
|
|
||||||
"""Returns text assigned to cell.
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
text : string
|
|
||||||
"""
|
|
||||||
return self.text
|
|
||||||
|
|
||||||
def add_object(self, t_object):
|
|
||||||
"""Adds PDFMiner text object to cell.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
t_object : object
|
|
||||||
"""
|
|
||||||
self.text_objects.append(t_object)
|
|
||||||
|
|
||||||
def get_objects(self):
|
|
||||||
"""Returns list of text objects assigned to cell.
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
text_objects : list
|
|
||||||
"""
|
|
||||||
return self.text_objects
|
|
||||||
|
|
||||||
def get_bounded_edges(self):
|
|
||||||
"""Returns the number of edges by which a cell is bounded.
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
bounded_edges : int
|
|
||||||
"""
|
|
||||||
self.bounded_edges = self.top + self.bottom + self.left + self.right
|
|
||||||
return self.bounded_edges
|
|
||||||
|
|
@ -1,9 +1,138 @@
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
from .cell import Cell
|
|
||||||
|
|
||||||
|
|
||||||
class Table:
|
class Cell(object):
    """Cell.
    Defines a cell object with coordinates relative to a left-bottom
    origin, which is also PDFMiner's coordinate space.

    Parameters
    ----------
    x1 : float
        x-coordinate of left-bottom point.

    y1 : float
        y-coordinate of left-bottom point.

    x2 : float
        x-coordinate of right-top point.

    y2 : float
        y-coordinate of right-top point.

    Attributes
    ----------
    lb : tuple
        Tuple representing left-bottom coordinates.

    lt : tuple
        Tuple representing left-top coordinates.

    rb : tuple
        Tuple representing right-bottom coordinates.

    rt : tuple
        Tuple representing right-top coordinates.

    bbox : tuple
        Tuple representing the cell's bounding box using the
        left-bottom and right-top coordinates.

    left : bool
        Whether or not cell is bounded on the left.

    right : bool
        Whether or not cell is bounded on the right.

    top : bool
        Whether or not cell is bounded on the top.

    bottom : bool
        Whether or not cell is bounded on the bottom.

    text_objects : list
        List of text objects assigned to cell.

    text : string
        Text assigned to cell.

    spanning_h : bool
        Whether or not cell spans/extends horizontally.

    spanning_v : bool
        Whether or not cell spans/extends vertically.

    image : object
        Initialised to None; this class never assigns it elsewhere.

    bounded_edges : int
        Number of bounded edges as of the last call to
        get_bounded_edges (0 until then).
    """

    def __init__(self, x1, y1, x2, y2):

        self.x1 = x1
        self.y1 = y1
        self.x2 = x2
        self.y2 = y2
        self.lb = (x1, y1)
        self.lt = (x1, y2)
        self.rb = (x2, y1)
        self.rt = (x2, y2)
        self.bbox = (x1, y1, x2, y2)
        self.left = False
        self.right = False
        self.top = False
        self.bottom = False
        self.text_objects = []
        self.text = ''
        self.spanning_h = False
        self.spanning_v = False
        self.image = None
        # Defined up front so the attribute can be read before
        # get_bounded_edges() has ever been called.
        self.bounded_edges = 0

    def __repr__(self):
        return '<Cell x1={0} y1={1} x2={2} y2={3}>'.format(
            self.x1, self.y1, self.x2, self.y2)

    def add_text(self, text):
        """Adds text to cell.

        Parameters
        ----------
        text : string
        """
        self.text = ''.join([self.text, text])

    def get_text(self):
        """Returns text assigned to cell.

        Returns
        -------
        text : string
        """
        return self.text

    def add_object(self, t_object):
        """Adds PDFMiner text object to cell.

        Parameters
        ----------
        t_object : object
        """
        self.text_objects.append(t_object)

    def get_objects(self):
        """Returns list of text objects assigned to cell.

        Returns
        -------
        text_objects : list
        """
        return self.text_objects

    def get_bounded_edges(self):
        """Returns the number of edges by which a cell is bounded.

        The count is also cached on the instance as ``bounded_edges``.

        Returns
        -------
        bounded_edges : int
        """
        # The four flags are booleans, so adding them counts the True ones.
        self.bounded_edges = self.top + self.bottom + self.left + self.right
        return self.bounded_edges
|
||||||
|
|
||||||
|
|
||||||
|
class Table(object):
|
||||||
"""Table.
|
"""Table.
|
||||||
Defines a table object with coordinates relative to a left-bottom
|
Defines a table object with coordinates relative to a left-bottom
|
||||||
origin, which is also PDFMiner's coordinate space.
|
origin, which is also PDFMiner's coordinate space.
|
||||||
|
|
@ -234,3 +363,7 @@ class Table:
|
||||||
ar.append([self.cells[r][c].get_text().strip()
|
ar.append([self.cells[r][c].get_text().strip()
|
||||||
for c in range(len(self.cols))])
|
for c in range(len(self.cols))])
|
||||||
return ar
|
return ar
|
||||||
|
|
||||||
|
|
||||||
|
class TableSet(object):
    """Empty placeholder class; no attributes or behaviour are defined yet.

    NOTE(review): presumably intended to group Table objects -- confirm
    once an implementation exists.
    """
    pass
|
||||||
|
|
@ -8,9 +8,9 @@ import copy_reg
|
||||||
import warnings
|
import warnings
|
||||||
import subprocess
|
import subprocess
|
||||||
|
|
||||||
|
from .core import Table
|
||||||
from .imgproc import (adaptive_threshold, find_lines, find_table_contours,
|
from .imgproc import (adaptive_threshold, find_lines, find_table_contours,
|
||||||
find_table_joints)
|
find_table_joints)
|
||||||
from .table import Table
|
|
||||||
from .utils import (scale_to_pdf, scale_to_image, segments_bbox, text_in_bbox,
|
from .utils import (scale_to_pdf, scale_to_image, segments_bbox, text_in_bbox,
|
||||||
merge_close_values, get_table_index, get_score, count_empty,
|
merge_close_values, get_table_index, get_score, count_empty,
|
||||||
encode_list, get_text_objects, get_page_layout)
|
encode_list, get_text_objects, get_page_layout)
|
||||||
|
|
|
||||||
|
|
@ -8,7 +8,7 @@ import warnings
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
from .table import Table
|
from .core import Table
|
||||||
from .utils import (text_in_bbox, get_table_index, get_score, count_empty,
|
from .utils import (text_in_bbox, get_table_index, get_score, count_empty,
|
||||||
encode_list, get_text_objects, get_page_layout)
|
encode_list, get_text_objects, get_page_layout)
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue