Add support for table areas and regions

2020-12-06 07:04:47 +05:30 · 2020-12-06 07:04:47 +05:30 · 6612a2f58d
parent 1287abeeaf
commit 6612a2f58d
1 changed files with 54 additions and 25 deletions
--- a/camelot/parsers/lattice_ocr.py
+++ b/camelot/parsers/lattice_ocr.py
@ -17,7 +17,7 @@ from PIL import Image
 from .base import BaseParser
 from ..core import Table
-from ..utils import TemporaryDirectory, merge_close_lines, scale_image, segments_in_bbox
+from ..utils import TemporaryDirectory, merge_close_lines, scale_pdf, segments_in_bbox
 from ..image_processing import (
    adaptive_threshold,
    find_lines,
@ -32,6 +32,7 @@ logger = logging.getLogger("camelot")
 class LatticeOCR(BaseParser):
    def __init__(
        self,
        table_regions=None,
        table_areas=None,
        line_scale=15,
        line_tol=2,
@ -41,6 +42,7 @@ class LatticeOCR(BaseParser):
        iterations=0,
        resolution=300,
    ):
        self.table_regions = table_regions
        self.table_areas = table_areas
        self.line_scale = line_scale
        self.line_tol = line_tol
@ -59,7 +61,7 @@ class LatticeOCR(BaseParser):
        from ..ext.ghostscript import Ghostscript
        self.imagename = "".join([self.rootname, ".png"])
-        gs_call = "-q -sDEVICE=png16m -o {} -r300 {}".format(
+        gs_call = "-q -sDEVICE=png16m -o {} -r900 {}".format(
            self.imagename, self.filename
        )
        gs_call = gs_call.encode().split()
@ -69,13 +71,51 @@ class LatticeOCR(BaseParser):
        null.close()
    def _generate_table_bbox(self):
        def scale_areas(areas, scalers):
            scaled_areas = []
            for area in areas:
                x1, y1, x2, y2 = area.split(",")
                x1 = float(x1)
                y1 = float(y1)
                x2 = float(x2)
                y2 = float(y2)
                x1, y1, x2, y2 = scale_pdf((x1, y1, x2, y2), scalers)
                scaled_areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1)))
            return scaled_areas
        self.image, self.threshold = adaptive_threshold(
            self.imagename, blocksize=self.threshold_blocksize, c=self.threshold_constant
        )
        image_width = self.image.shape[1]
        image_height = self.image.shape[0]
        image_width_scaler = image_width / float(self.pdf_width)
        image_height_scaler = image_height / float(self.pdf_height)
        image_scalers = (image_width_scaler, image_height_scaler, self.pdf_height)
        if self.table_areas is None:
            regions = None
            if self.table_regions is not None:
                regions = scale_areas(self.table_regions, image_scalers)
            vertical_mask, vertical_segments = find_lines(
                self.threshold,
                regions=regions,
                direction="vertical",
                line_scale=self.line_scale,
                iterations=self.iterations,
            )
            horizontal_mask, horizontal_segments = find_lines(
                self.threshold,
                regions=regions,
                direction="horizontal",
                line_scale=self.line_scale,
                iterations=self.iterations,
            )
            contours = find_contours(vertical_mask, horizontal_mask)
            table_bbox = find_joints(contours, vertical_mask, horizontal_mask)
        else:
            vertical_mask, vertical_segments = find_lines(
                self.threshold,
                direction="vertical",
@ -89,19 +129,8 @@ class LatticeOCR(BaseParser):
                iterations=self.iterations,
            )
-        if self.table_areas is not None:
+            areas = scale_areas(self.table_areas, image_scalers)
            areas = []
            for area in self.table_areas:
                x1, y1, x2, y2 = area.split(",")
                x1 = int(float(x1))
                y1 = int(float(y1))
                x2 = int(float(x2))
                y2 = int(float(y2))
                areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1)))
            table_bbox = find_joints(areas, vertical_mask, horizontal_mask)
        else:
            contours = find_contours(vertical_mask, horizontal_mask)
            table_bbox = find_joints(contours, vertical_mask, horizontal_mask)
        self.table_bbox_unscaled = copy.deepcopy(table_bbox)