Rename table_bbox (singular) to table_areas

The object is an index of bounding boxes, in some cases supplied by users.
It is already called "areas" in one section of the code; this rename makes the naming systematic.
pull/127/head
Frh 2020-04-10 16:34:30 -07:00
parent 270c76a3e7
commit 49d3f0f3aa
4 changed files with 31 additions and 30 deletions

View File

@ -207,7 +207,7 @@ class Lattice(BaseParser):
t.cells[i][j].text = t.cells[i - 1][j].text t.cells[i][j].text = t.cells[i - 1][j].text
return t return t
def _generate_table_bbox(self): def _generate_table_areas(self):
def scale_areas(areas): def scale_areas(areas):
scaled_areas = [] scaled_areas = []
for area in areas: for area in areas:
@ -258,7 +258,7 @@ class Lattice(BaseParser):
) )
contours = find_contours(vertical_mask, horizontal_mask) contours = find_contours(vertical_mask, horizontal_mask)
table_bbox = find_joints(contours, vertical_mask, horizontal_mask) table_areas = find_joints(contours, vertical_mask, horizontal_mask)
else: else:
vertical_mask, vertical_segments = find_lines( vertical_mask, vertical_segments = find_lines(
self.threshold, self.threshold,
@ -274,20 +274,20 @@ class Lattice(BaseParser):
) )
areas = scale_areas(self.table_areas) areas = scale_areas(self.table_areas)
table_bbox = find_joints(areas, vertical_mask, horizontal_mask) table_areas = find_joints(areas, vertical_mask, horizontal_mask)
self.table_bbox_unscaled = copy.deepcopy(table_bbox) self.table_areas_unscaled = copy.deepcopy(table_areas)
[ [
self.table_bbox, self.table_areas,
self.vertical_segments, self.vertical_segments,
self.horizontal_segments self.horizontal_segments
] = scale_image( ] = scale_image(
table_bbox, vertical_segments, horizontal_segments, pdf_scalers table_areas, vertical_segments, horizontal_segments, pdf_scalers
) )
def _generate_columns_and_rows(self, table_idx, tk): def _generate_columns_and_rows(self, table_idx, tk):
# select elements which lie within table_bbox # select elements which lie within table_areas
t_bbox = {} t_bbox = {}
v_s, h_s = segments_in_bbox( v_s, h_s = segments_in_bbox(
tk, self.vertical_segments, self.horizontal_segments tk, self.vertical_segments, self.horizontal_segments
@ -300,7 +300,7 @@ class Lattice(BaseParser):
self.t_bbox = t_bbox self.t_bbox = t_bbox
cols, rows = zip(*self.table_bbox[tk]) cols, rows = zip(*self.table_areas[tk])
cols, rows = list(cols), list(rows) cols, rows = list(cols), list(rows)
cols.extend([tk[0], tk[2]]) cols.extend([tk[0], tk[2]])
rows.extend([tk[1], tk[3]]) rows.extend([tk[1], tk[3]])
@ -366,7 +366,7 @@ class Lattice(BaseParser):
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text]) _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text]) _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
table._text = _text table._text = _text
table._image = (self.pdf_image, self.table_bbox_unscaled) table._image = (self.pdf_image, self.table_areas_unscaled)
table._segments = (self.vertical_segments, self.horizontal_segments) table._segments = (self.vertical_segments, self.horizontal_segments)
table._textedges = None table._textedges = None
@ -391,12 +391,12 @@ class Lattice(BaseParser):
return [] return []
self._generate_image_file() self._generate_image_file()
self._generate_table_bbox() self._generate_table_areas()
_tables = [] _tables = []
# sort tables based on y-coord # sort tables based on y-coord
for table_idx, tk in enumerate( for table_idx, tk in enumerate(
sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True) sorted(self.table_areas.keys(), key=lambda x: x[1], reverse=True)
): ):
cols, rows, v_s, h_s = self._generate_columns_and_rows( cols, rows, v_s, h_s = self._generate_columns_and_rows(
table_idx, tk) table_idx, tk)

View File

@ -299,14 +299,14 @@ class Stream(BaseParser):
relevant_textedges = textedges.get_relevant() relevant_textedges = textedges.get_relevant()
self.textedges.extend(relevant_textedges) self.textedges.extend(relevant_textedges)
# guess table areas using textlines and relevant edges # guess table areas using textlines and relevant edges
table_bbox = textedges.get_table_areas(textlines, relevant_textedges) table_areas = textedges.get_table_areas(textlines, relevant_textedges)
# treat whole page as table area if no table areas found # treat whole page as table area if no table areas found
if not table_bbox: if not table_areas:
table_bbox = {(0, 0, self.pdf_width, self.pdf_height): None} table_areas = {(0, 0, self.pdf_width, self.pdf_height): None}
return table_bbox return table_areas
def _generate_table_bbox(self): def _generate_table_areas(self):
self.textedges = [] self.textedges = []
if self.table_areas is None: if self.table_areas is None:
all_text_segments = self.horizontal_text + self.vertical_text all_text_segments = self.horizontal_text + self.vertical_text
@ -325,20 +325,20 @@ class Stream(BaseParser):
(x1, y2, x2, y1), all_text_segments) (x1, y2, x2, y1), all_text_segments)
text_segments.extend(region_text) text_segments.extend(region_text)
# find tables based on nurminen's detection algorithm # find tables based on nurminen's detection algorithm
table_bbox = self._nurminen_table_detection(text_segments) table_areas = self._nurminen_table_detection(text_segments)
else: else:
table_bbox = {} table_areas = {}
for area in self.table_areas: for area in self.table_areas:
x1, y1, x2, y2 = area.split(",") x1, y1, x2, y2 = area.split(",")
x1 = float(x1) x1 = float(x1)
y1 = float(y1) y1 = float(y1)
x2 = float(x2) x2 = float(x2)
y2 = float(y2) y2 = float(y2)
table_bbox[(x1, y2, x2, y1)] = None table_areas[(x1, y2, x2, y1)] = None
self.table_bbox = table_bbox self.table_areas = table_areas
def _generate_columns_and_rows(self, table_idx, tk): def _generate_columns_and_rows(self, table_idx, tk):
# select elements which lie within table_bbox # select elements which lie within table_areas
t_bbox = {} t_bbox = {}
t_bbox["horizontal"] = text_in_bbox(tk, self.horizontal_text) t_bbox["horizontal"] = text_in_bbox(tk, self.horizontal_text)
t_bbox["vertical"] = text_in_bbox(tk, self.vertical_text) t_bbox["vertical"] = text_in_bbox(tk, self.vertical_text)
@ -464,7 +464,7 @@ class Stream(BaseParser):
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text]) _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
table._text = _text table._text = _text
self.generate_image() self.generate_image()
table._image = (self.pdf_image, self.table_bbox) table._image = (self.pdf_image, self.table_areas)
table._segments = None table._segments = None
table._textedges = self.textedges table._textedges = self.textedges
@ -492,13 +492,13 @@ class Stream(BaseParser):
return [] return []
# Identify plausible areas within the doc where tables lie, # Identify plausible areas within the doc where tables lie,
# populate table_bbox keys with these areas. # populate table_areas keys with these areas.
self._generate_table_bbox() self._generate_table_areas()
_tables = [] _tables = []
# sort tables based on y-coord # sort tables based on y-coord
for table_idx, tk in enumerate( for table_idx, tk in enumerate(
sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True) sorted(self.table_areas.keys(), key=lambda x: x[1], reverse=True)
): ):
cols, rows = self._generate_columns_and_rows(table_idx, tk) cols, rows = self._generate_columns_and_rows(table_idx, tk)
table = self._generate_table(table_idx, cols, rows) table = self._generate_table(table_idx, cols, rows)

View File

@ -121,7 +121,7 @@ class PlotMethods(object):
fig : matplotlib.fig.Figure fig : matplotlib.fig.Figure
""" """
img, table_bbox = table._image img, table_areas = table._image
_FOR_LATTICE = table.flavor == "lattice" _FOR_LATTICE = table.flavor == "lattice"
fig = plt.figure() fig = plt.figure()
ax = fig.add_subplot(111, aspect="equal") ax = fig.add_subplot(111, aspect="equal")
@ -137,7 +137,7 @@ class PlotMethods(object):
) )
) )
for t in table_bbox.keys(): for t in table_areas.keys():
ax.add_patch( ax.add_patch(
patches.Rectangle( patches.Rectangle(
(t[0], t[1]), t[2] - t[0], t[3] - t[1], (t[0], t[1]), t[2] - t[0], t[3] - t[1],
@ -204,13 +204,13 @@ class PlotMethods(object):
fig : matplotlib.fig.Figure fig : matplotlib.fig.Figure
""" """
img, table_bbox = table._image img, table_areas = table._image
fig = plt.figure() fig = plt.figure()
ax = fig.add_subplot(111, aspect="equal") ax = fig.add_subplot(111, aspect="equal")
x_coord = [] x_coord = []
y_coord = [] y_coord = []
for k in table_bbox.keys(): for k in table_areas.keys():
for coord in table_bbox[k]: for coord in table_areas[k]:
x_coord.append(coord[0]) x_coord.append(coord[0])
y_coord.append(coord[1]) y_coord.append(coord[1])
ax.plot(x_coord, y_coord, "ro") ax.plot(x_coord, y_coord, "ro")

View File

@ -192,6 +192,7 @@ def test_stream_vertical_header():
filename = os.path.join(testdir, "vertical_header.pdf") filename = os.path.join(testdir, "vertical_header.pdf")
tables = camelot.read_pdf(filename, flavor="stream") tables = camelot.read_pdf(filename, flavor="stream")
assert len(tables) == 1
assert_frame_equal(df, tables[0].df) assert_frame_equal(df, tables[0].df)