From 49d3f0f3aa1547d66eccb33ef0e4e0ebaad78d21 Mon Sep 17 00:00:00 2001 From: Frh Date: Fri, 10 Apr 2020 16:34:30 -0700 Subject: [PATCH] Rename table_bbox (singular) to table_areas The object is an index of bounding boxes, in some cases given by users. It is already called areas in one section of the code; this rename makes the naming systematic. --- camelot/parsers/lattice.py | 22 +++++++++++----------- camelot/parsers/stream.py | 28 ++++++++++++++-------------- camelot/plotting.py | 10 +++++----- tests/test_common.py | 1 + 4 files changed, 31 insertions(+), 30 deletions(-) diff --git a/camelot/parsers/lattice.py b/camelot/parsers/lattice.py index a96f8df..eefd443 100644 --- a/camelot/parsers/lattice.py +++ b/camelot/parsers/lattice.py @@ -207,7 +207,7 @@ class Lattice(BaseParser): t.cells[i][j].text = t.cells[i - 1][j].text return t - def _generate_table_bbox(self): + def _generate_table_areas(self): def scale_areas(areas): scaled_areas = [] for area in areas: @@ -258,7 +258,7 @@ class Lattice(BaseParser): ) contours = find_contours(vertical_mask, horizontal_mask) - table_bbox = find_joints(contours, vertical_mask, horizontal_mask) + table_areas = find_joints(contours, vertical_mask, horizontal_mask) else: vertical_mask, vertical_segments = find_lines( self.threshold, @@ -274,20 +274,20 @@ class Lattice(BaseParser): ) areas = scale_areas(self.table_areas) - table_bbox = find_joints(areas, vertical_mask, horizontal_mask) + table_areas = find_joints(areas, vertical_mask, horizontal_mask) - self.table_bbox_unscaled = copy.deepcopy(table_bbox) + self.table_areas_unscaled = copy.deepcopy(table_areas) [ - self.table_bbox, + self.table_areas, self.vertical_segments, self.horizontal_segments ] = scale_image( - table_bbox, vertical_segments, horizontal_segments, pdf_scalers + table_areas, vertical_segments, horizontal_segments, pdf_scalers ) def _generate_columns_and_rows(self, table_idx, tk): - # select elements which lie within table_bbox + # select elements which lie within table_areas t_bbox = {} 
v_s, h_s = segments_in_bbox( tk, self.vertical_segments, self.horizontal_segments @@ -300,7 +300,7 @@ class Lattice(BaseParser): self.t_bbox = t_bbox - cols, rows = zip(*self.table_bbox[tk]) + cols, rows = zip(*self.table_areas[tk]) cols, rows = list(cols), list(rows) cols.extend([tk[0], tk[2]]) rows.extend([tk[1], tk[3]]) @@ -366,7 +366,7 @@ class Lattice(BaseParser): _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text]) _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text]) table._text = _text - table._image = (self.pdf_image, self.table_bbox_unscaled) + table._image = (self.pdf_image, self.table_areas_unscaled) table._segments = (self.vertical_segments, self.horizontal_segments) table._textedges = None @@ -391,12 +391,12 @@ class Lattice(BaseParser): return [] self._generate_image_file() - self._generate_table_bbox() + self._generate_table_areas() _tables = [] # sort tables based on y-coord for table_idx, tk in enumerate( - sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True) + sorted(self.table_areas.keys(), key=lambda x: x[1], reverse=True) ): cols, rows, v_s, h_s = self._generate_columns_and_rows( table_idx, tk) diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py index 0d393f3..ae236a1 100644 --- a/camelot/parsers/stream.py +++ b/camelot/parsers/stream.py @@ -299,14 +299,14 @@ class Stream(BaseParser): relevant_textedges = textedges.get_relevant() self.textedges.extend(relevant_textedges) # guess table areas using textlines and relevant edges - table_bbox = textedges.get_table_areas(textlines, relevant_textedges) + table_areas = textedges.get_table_areas(textlines, relevant_textedges) # treat whole page as table area if no table areas found - if not table_bbox: - table_bbox = {(0, 0, self.pdf_width, self.pdf_height): None} + if not table_areas: + table_areas = {(0, 0, self.pdf_width, self.pdf_height): None} - return table_bbox + return table_areas - def _generate_table_bbox(self): + def 
_generate_table_areas(self): self.textedges = [] if self.table_areas is None: all_text_segments = self.horizontal_text + self.vertical_text @@ -325,20 +325,20 @@ class Stream(BaseParser): (x1, y2, x2, y1), all_text_segments) text_segments.extend(region_text) # find tables based on nurminen's detection algorithm - table_bbox = self._nurminen_table_detection(text_segments) + table_areas = self._nurminen_table_detection(text_segments) else: - table_bbox = {} + table_areas = {} for area in self.table_areas: x1, y1, x2, y2 = area.split(",") x1 = float(x1) y1 = float(y1) x2 = float(x2) y2 = float(y2) - table_bbox[(x1, y2, x2, y1)] = None - self.table_bbox = table_bbox + table_areas[(x1, y2, x2, y1)] = None + self.table_areas = table_areas def _generate_columns_and_rows(self, table_idx, tk): - # select elements which lie within table_bbox + # select elements which lie within table_areas t_bbox = {} t_bbox["horizontal"] = text_in_bbox(tk, self.horizontal_text) t_bbox["vertical"] = text_in_bbox(tk, self.vertical_text) @@ -464,7 +464,7 @@ class Stream(BaseParser): _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text]) table._text = _text self.generate_image() - table._image = (self.pdf_image, self.table_bbox) + table._image = (self.pdf_image, self.table_areas) table._segments = None table._textedges = self.textedges @@ -492,13 +492,13 @@ class Stream(BaseParser): return [] # Identify plausible areas within the doc where tables lie, - # populate table_bbox keys with these areas. - self._generate_table_bbox() + # populate table_areas keys with these areas. 
+ self._generate_table_areas() _tables = [] # sort tables based on y-coord for table_idx, tk in enumerate( - sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True) + sorted(self.table_areas.keys(), key=lambda x: x[1], reverse=True) ): cols, rows = self._generate_columns_and_rows(table_idx, tk) table = self._generate_table(table_idx, cols, rows) diff --git a/camelot/plotting.py b/camelot/plotting.py index 75d4449..07ba0a6 100644 --- a/camelot/plotting.py +++ b/camelot/plotting.py @@ -121,7 +121,7 @@ class PlotMethods(object): fig : matplotlib.fig.Figure """ - img, table_bbox = table._image + img, table_areas = table._image _FOR_LATTICE = table.flavor == "lattice" fig = plt.figure() ax = fig.add_subplot(111, aspect="equal") @@ -137,7 +137,7 @@ class PlotMethods(object): ) ) - for t in table_bbox.keys(): + for t in table_areas.keys(): ax.add_patch( patches.Rectangle( (t[0], t[1]), t[2] - t[0], t[3] - t[1], @@ -204,13 +204,13 @@ class PlotMethods(object): fig : matplotlib.fig.Figure """ - img, table_bbox = table._image + img, table_areas = table._image fig = plt.figure() ax = fig.add_subplot(111, aspect="equal") x_coord = [] y_coord = [] - for k in table_bbox.keys(): - for coord in table_bbox[k]: + for k in table_areas.keys(): + for coord in table_areas[k]: x_coord.append(coord[0]) y_coord.append(coord[1]) ax.plot(x_coord, y_coord, "ro") diff --git a/tests/test_common.py b/tests/test_common.py index 20941e8..ac20d0b 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -192,6 +192,7 @@ def test_stream_vertical_header(): filename = os.path.join(testdir, "vertical_header.pdf") tables = camelot.read_pdf(filename, flavor="stream") + assert len(tables) == 1 assert_frame_equal(df, tables[0].df)