parent
49d3f0f3aa
commit
6d62c84954
|
|
@ -207,7 +207,7 @@ class Lattice(BaseParser):
|
||||||
t.cells[i][j].text = t.cells[i - 1][j].text
|
t.cells[i][j].text = t.cells[i - 1][j].text
|
||||||
return t
|
return t
|
||||||
|
|
||||||
def _generate_table_areas(self):
|
def _generate_table_bbox(self):
|
||||||
def scale_areas(areas):
|
def scale_areas(areas):
|
||||||
scaled_areas = []
|
scaled_areas = []
|
||||||
for area in areas:
|
for area in areas:
|
||||||
|
|
@ -258,7 +258,7 @@ class Lattice(BaseParser):
|
||||||
)
|
)
|
||||||
|
|
||||||
contours = find_contours(vertical_mask, horizontal_mask)
|
contours = find_contours(vertical_mask, horizontal_mask)
|
||||||
table_areas = find_joints(contours, vertical_mask, horizontal_mask)
|
table_bbox = find_joints(contours, vertical_mask, horizontal_mask)
|
||||||
else:
|
else:
|
||||||
vertical_mask, vertical_segments = find_lines(
|
vertical_mask, vertical_segments = find_lines(
|
||||||
self.threshold,
|
self.threshold,
|
||||||
|
|
@ -274,20 +274,20 @@ class Lattice(BaseParser):
|
||||||
)
|
)
|
||||||
|
|
||||||
areas = scale_areas(self.table_areas)
|
areas = scale_areas(self.table_areas)
|
||||||
table_areas = find_joints(areas, vertical_mask, horizontal_mask)
|
table_bbox = find_joints(areas, vertical_mask, horizontal_mask)
|
||||||
|
|
||||||
self.table_areas_unscaled = copy.deepcopy(table_areas)
|
self.table_bbox_unscaled = copy.deepcopy(table_bbox)
|
||||||
|
|
||||||
[
|
[
|
||||||
self.table_areas,
|
self.table_bbox,
|
||||||
self.vertical_segments,
|
self.vertical_segments,
|
||||||
self.horizontal_segments
|
self.horizontal_segments
|
||||||
] = scale_image(
|
] = scale_image(
|
||||||
table_areas, vertical_segments, horizontal_segments, pdf_scalers
|
table_bbox, vertical_segments, horizontal_segments, pdf_scalers
|
||||||
)
|
)
|
||||||
|
|
||||||
def _generate_columns_and_rows(self, table_idx, tk):
|
def _generate_columns_and_rows(self, table_idx, tk):
|
||||||
# select elements which lie within table_areas
|
# select elements which lie within table_bbox
|
||||||
t_bbox = {}
|
t_bbox = {}
|
||||||
v_s, h_s = segments_in_bbox(
|
v_s, h_s = segments_in_bbox(
|
||||||
tk, self.vertical_segments, self.horizontal_segments
|
tk, self.vertical_segments, self.horizontal_segments
|
||||||
|
|
@ -300,7 +300,7 @@ class Lattice(BaseParser):
|
||||||
|
|
||||||
self.t_bbox = t_bbox
|
self.t_bbox = t_bbox
|
||||||
|
|
||||||
cols, rows = zip(*self.table_areas[tk])
|
cols, rows = zip(*self.table_bbox[tk])
|
||||||
cols, rows = list(cols), list(rows)
|
cols, rows = list(cols), list(rows)
|
||||||
cols.extend([tk[0], tk[2]])
|
cols.extend([tk[0], tk[2]])
|
||||||
rows.extend([tk[1], tk[3]])
|
rows.extend([tk[1], tk[3]])
|
||||||
|
|
@ -366,7 +366,7 @@ class Lattice(BaseParser):
|
||||||
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
|
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
|
||||||
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
|
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
|
||||||
table._text = _text
|
table._text = _text
|
||||||
table._image = (self.pdf_image, self.table_areas_unscaled)
|
table._image = (self.pdf_image, self.table_bbox_unscaled)
|
||||||
table._segments = (self.vertical_segments, self.horizontal_segments)
|
table._segments = (self.vertical_segments, self.horizontal_segments)
|
||||||
table._textedges = None
|
table._textedges = None
|
||||||
|
|
||||||
|
|
@ -391,12 +391,12 @@ class Lattice(BaseParser):
|
||||||
return []
|
return []
|
||||||
|
|
||||||
self._generate_image_file()
|
self._generate_image_file()
|
||||||
self._generate_table_areas()
|
self._generate_table_bbox()
|
||||||
|
|
||||||
_tables = []
|
_tables = []
|
||||||
# sort tables based on y-coord
|
# sort tables based on y-coord
|
||||||
for table_idx, tk in enumerate(
|
for table_idx, tk in enumerate(
|
||||||
sorted(self.table_areas.keys(), key=lambda x: x[1], reverse=True)
|
sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True)
|
||||||
):
|
):
|
||||||
cols, rows, v_s, h_s = self._generate_columns_and_rows(
|
cols, rows, v_s, h_s = self._generate_columns_and_rows(
|
||||||
table_idx, tk)
|
table_idx, tk)
|
||||||
|
|
|
||||||
|
|
@ -299,14 +299,14 @@ class Stream(BaseParser):
|
||||||
relevant_textedges = textedges.get_relevant()
|
relevant_textedges = textedges.get_relevant()
|
||||||
self.textedges.extend(relevant_textedges)
|
self.textedges.extend(relevant_textedges)
|
||||||
# guess table areas using textlines and relevant edges
|
# guess table areas using textlines and relevant edges
|
||||||
table_areas = textedges.get_table_areas(textlines, relevant_textedges)
|
table_bbox = textedges.get_table_areas(textlines, relevant_textedges)
|
||||||
# treat whole page as table area if no table areas found
|
# treat whole page as table area if no table areas found
|
||||||
if not table_areas:
|
if not table_bbox:
|
||||||
table_areas = {(0, 0, self.pdf_width, self.pdf_height): None}
|
table_bbox = {(0, 0, self.pdf_width, self.pdf_height): None}
|
||||||
|
|
||||||
return table_areas
|
return table_bbox
|
||||||
|
|
||||||
def _generate_table_areas(self):
|
def _generate_table_bbox(self):
|
||||||
self.textedges = []
|
self.textedges = []
|
||||||
if self.table_areas is None:
|
if self.table_areas is None:
|
||||||
all_text_segments = self.horizontal_text + self.vertical_text
|
all_text_segments = self.horizontal_text + self.vertical_text
|
||||||
|
|
@ -325,20 +325,20 @@ class Stream(BaseParser):
|
||||||
(x1, y2, x2, y1), all_text_segments)
|
(x1, y2, x2, y1), all_text_segments)
|
||||||
text_segments.extend(region_text)
|
text_segments.extend(region_text)
|
||||||
# find tables based on nurminen's detection algorithm
|
# find tables based on nurminen's detection algorithm
|
||||||
table_areas = self._nurminen_table_detection(text_segments)
|
table_bbox = self._nurminen_table_detection(text_segments)
|
||||||
else:
|
else:
|
||||||
table_areas = {}
|
table_bbox = {}
|
||||||
for area in self.table_areas:
|
for area in self.table_areas:
|
||||||
x1, y1, x2, y2 = area.split(",")
|
x1, y1, x2, y2 = area.split(",")
|
||||||
x1 = float(x1)
|
x1 = float(x1)
|
||||||
y1 = float(y1)
|
y1 = float(y1)
|
||||||
x2 = float(x2)
|
x2 = float(x2)
|
||||||
y2 = float(y2)
|
y2 = float(y2)
|
||||||
table_areas[(x1, y2, x2, y1)] = None
|
table_bbox[(x1, y2, x2, y1)] = None
|
||||||
self.table_areas = table_areas
|
self.table_bbox = table_bbox
|
||||||
|
|
||||||
def _generate_columns_and_rows(self, table_idx, tk):
|
def _generate_columns_and_rows(self, table_idx, tk):
|
||||||
# select elements which lie within table_areas
|
# select elements which lie within table_bbox
|
||||||
t_bbox = {}
|
t_bbox = {}
|
||||||
t_bbox["horizontal"] = text_in_bbox(tk, self.horizontal_text)
|
t_bbox["horizontal"] = text_in_bbox(tk, self.horizontal_text)
|
||||||
t_bbox["vertical"] = text_in_bbox(tk, self.vertical_text)
|
t_bbox["vertical"] = text_in_bbox(tk, self.vertical_text)
|
||||||
|
|
@ -464,7 +464,7 @@ class Stream(BaseParser):
|
||||||
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
|
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
|
||||||
table._text = _text
|
table._text = _text
|
||||||
self.generate_image()
|
self.generate_image()
|
||||||
table._image = (self.pdf_image, self.table_areas)
|
table._image = (self.pdf_image, self.table_bbox)
|
||||||
table._segments = None
|
table._segments = None
|
||||||
table._textedges = self.textedges
|
table._textedges = self.textedges
|
||||||
|
|
||||||
|
|
@ -492,13 +492,13 @@ class Stream(BaseParser):
|
||||||
return []
|
return []
|
||||||
|
|
||||||
# Identify plausible areas within the doc where tables lie,
|
# Identify plausible areas within the doc where tables lie,
|
||||||
# populate table_areas keys with these areas.
|
# populate table_bbox keys with these areas.
|
||||||
self._generate_table_areas()
|
self._generate_table_bbox()
|
||||||
|
|
||||||
_tables = []
|
_tables = []
|
||||||
# sort tables based on y-coord
|
# sort tables based on y-coord
|
||||||
for table_idx, tk in enumerate(
|
for table_idx, tk in enumerate(
|
||||||
sorted(self.table_areas.keys(), key=lambda x: x[1], reverse=True)
|
sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True)
|
||||||
):
|
):
|
||||||
cols, rows = self._generate_columns_and_rows(table_idx, tk)
|
cols, rows = self._generate_columns_and_rows(table_idx, tk)
|
||||||
table = self._generate_table(table_idx, cols, rows)
|
table = self._generate_table(table_idx, cols, rows)
|
||||||
|
|
|
||||||
|
|
@ -121,7 +121,7 @@ class PlotMethods(object):
|
||||||
fig : matplotlib.fig.Figure
|
fig : matplotlib.fig.Figure
|
||||||
|
|
||||||
"""
|
"""
|
||||||
img, table_areas = table._image
|
img, table_bbox = table._image
|
||||||
_FOR_LATTICE = table.flavor == "lattice"
|
_FOR_LATTICE = table.flavor == "lattice"
|
||||||
fig = plt.figure()
|
fig = plt.figure()
|
||||||
ax = fig.add_subplot(111, aspect="equal")
|
ax = fig.add_subplot(111, aspect="equal")
|
||||||
|
|
@ -137,7 +137,7 @@ class PlotMethods(object):
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
for t in table_areas.keys():
|
for t in table_bbox.keys():
|
||||||
ax.add_patch(
|
ax.add_patch(
|
||||||
patches.Rectangle(
|
patches.Rectangle(
|
||||||
(t[0], t[1]), t[2] - t[0], t[3] - t[1],
|
(t[0], t[1]), t[2] - t[0], t[3] - t[1],
|
||||||
|
|
@ -204,13 +204,13 @@ class PlotMethods(object):
|
||||||
fig : matplotlib.fig.Figure
|
fig : matplotlib.fig.Figure
|
||||||
|
|
||||||
"""
|
"""
|
||||||
img, table_areas = table._image
|
img, table_bbox = table._image
|
||||||
fig = plt.figure()
|
fig = plt.figure()
|
||||||
ax = fig.add_subplot(111, aspect="equal")
|
ax = fig.add_subplot(111, aspect="equal")
|
||||||
x_coord = []
|
x_coord = []
|
||||||
y_coord = []
|
y_coord = []
|
||||||
for k in table_areas.keys():
|
for k in table_bbox.keys():
|
||||||
for coord in table_areas[k]:
|
for coord in table_bbox[k]:
|
||||||
x_coord.append(coord[0])
|
x_coord.append(coord[0])
|
||||||
y_coord.append(coord[1])
|
y_coord.append(coord[1])
|
||||||
ax.plot(x_coord, y_coord, "ro")
|
ax.plot(x_coord, y_coord, "ro")
|
||||||
|
|
|
||||||
|
|
@ -192,7 +192,6 @@ def test_stream_vertical_header():
|
||||||
|
|
||||||
filename = os.path.join(testdir, "vertical_header.pdf")
|
filename = os.path.join(testdir, "vertical_header.pdf")
|
||||||
tables = camelot.read_pdf(filename, flavor="stream")
|
tables = camelot.read_pdf(filename, flavor="stream")
|
||||||
assert len(tables) == 1
|
|
||||||
assert_frame_equal(df, tables[0].df)
|
assert_frame_equal(df, tables[0].df)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue