camelot-py/camelot/parsers/lattice_ocr.py

# -*- coding: utf-8 -*-

import os
import copy
import logging
import subprocess

try:
    import easyocr
except ImportError:
    _HAS_EASYOCR = False
else:
    _HAS_EASYOCR = True

import pandas as pd
from PIL import Image

from .base import BaseParser
from ..core import Table
from ..utils import TemporaryDirectory, merge_close_lines, scale_pdf, segments_in_bbox
from ..image_processing import (
    adaptive_threshold,
    find_lines,
    find_contours,
    find_joints,
)


logger = logging.getLogger("camelot")


class LatticeOCR(BaseParser):
    def __init__(
        self,
        table_regions=None,
        table_areas=None,
        line_scale=15,
        line_tol=2,
        joint_tol=2,
        threshold_blocksize=15,
        threshold_constant=-2,
        iterations=0,
        resolution=300,
    ):
        self.table_regions = table_regions
        self.table_areas = table_areas
        self.line_scale = line_scale
        self.line_tol = line_tol
        self.joint_tol = joint_tol
        self.threshold_blocksize = threshold_blocksize
        self.threshold_constant = threshold_constant
        self.iterations = iterations
        self.resolution = resolution

        if _HAS_EASYOCR:
            self.reader = easyocr.Reader(['en'], gpu=False)
        else:
            raise ImportError("easyocr is required to run OCR on image-based PDFs.")

    def _generate_image(self):
        from ..ext.ghostscript import Ghostscript

        self.imagename = "".join([self.rootname, ".png"])
        gs_call = "-q -sDEVICE=png16m -o {} -r900 {}".format(
            self.imagename, self.filename
        )
        gs_call = gs_call.encode().split()
        null = open(os.devnull, "wb")
        with Ghostscript(*gs_call, stdout=null) as gs:
            pass
        null.close()

    def _generate_table_bbox(self):
        def scale_areas(areas, scalers):
            scaled_areas = []
            for area in areas:
                x1, y1, x2, y2 = area.split(",")
                x1 = float(x1)
                y1 = float(y1)
                x2 = float(x2)
                y2 = float(y2)
                x1, y1, x2, y2 = scale_pdf((x1, y1, x2, y2), scalers)
                scaled_areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1)))
            return scaled_areas

        self.image, self.threshold = adaptive_threshold(
            self.imagename, blocksize=self.threshold_blocksize, c=self.threshold_constant
        )

        image_width = self.image.shape[1]
        image_height = self.image.shape[0]
        image_width_scaler = image_width / float(self.pdf_width)
        image_height_scaler = image_height / float(self.pdf_height)
        image_scalers = (image_width_scaler, image_height_scaler, self.pdf_height)

        if self.table_areas is None:
            regions = None
            if self.table_regions is not None:
                regions = scale_areas(self.table_regions, image_scalers)

            vertical_mask, vertical_segments = find_lines(
                self.threshold,
                regions=regions,
                direction="vertical",
                line_scale=self.line_scale,
                iterations=self.iterations,
            )
            horizontal_mask, horizontal_segments = find_lines(
                self.threshold,
                regions=regions,
                direction="horizontal",
                line_scale=self.line_scale,
                iterations=self.iterations,
            )

            contours = find_contours(vertical_mask, horizontal_mask)
            table_bbox = find_joints(contours, vertical_mask, horizontal_mask)
        else:
            vertical_mask, vertical_segments = find_lines(
                self.threshold,
                direction="vertical",
                line_scale=self.line_scale,
                iterations=self.iterations,
            )
            horizontal_mask, horizontal_segments = find_lines(
                self.threshold,
                direction="horizontal",
                line_scale=self.line_scale,
                iterations=self.iterations,
            )

            areas = scale_areas(self.table_areas, image_scalers)
            table_bbox = find_joints(areas, vertical_mask, horizontal_mask)

        self.table_bbox_unscaled = copy.deepcopy(table_bbox)

        self.table_bbox = table_bbox
        self.vertical_segments = vertical_segments
        self.horizontal_segments = horizontal_segments

    def _generate_columns_and_rows(self, table_idx, tk):
        cols, rows = zip(*self.table_bbox[tk])
        cols, rows = list(cols), list(rows)
        cols.extend([tk[0], tk[2]])
        rows.extend([tk[1], tk[3]])
        # sort horizontal and vertical segments
        cols = merge_close_lines(sorted(cols), line_tol=self.line_tol)
        rows = merge_close_lines(sorted(rows), line_tol=self.line_tol)
        # make grid using x and y coord of shortlisted rows and cols
        cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
        rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)]

        return cols, rows


    def _generate_table(self, table_idx, cols, rows, **kwargs):
        table = Table(cols, rows)
        # set table edges to True using ver+hor lines
        table = table.set_edges(self.vertical_segments, self.horizontal_segments, joint_tol=self.joint_tol)
        # set table border edges to True
        table = table.set_border()
        # set spanning cells to True
        table = table.set_span()

        _seen = set()
        for r_idx in range(len(table.cells)):
            for c_idx in range(len(table.cells[r_idx])):
                if (r_idx, c_idx) in _seen:
                    continue

                _seen.add((r_idx, c_idx))

                _r_idx = r_idx
                _c_idx = c_idx

                if table.cells[r_idx][_c_idx].hspan:
                    while not table.cells[r_idx][_c_idx].right:
                        _c_idx += 1
                        _seen.add((r_idx, _c_idx))

                if table.cells[_r_idx][c_idx].vspan:
                    while not table.cells[_r_idx][c_idx].bottom:
                        _r_idx += 1
                        _seen.add((_r_idx, c_idx))

                for i in range(r_idx, _r_idx + 1):
                    for j in range(c_idx, _c_idx + 1):
                        _seen.add((i, j))

                x1 = int(table.cells[r_idx][c_idx].x1)
                y1 = int(table.cells[_r_idx][_c_idx].y1)

                x2 = int(table.cells[_r_idx][_c_idx].x2)
                y2 = int(table.cells[r_idx][c_idx].y2)

                with TemporaryDirectory() as tempdir:
                    temp_image_path = os.path.join(tempdir, f"{table_idx}_{r_idx}_{c_idx}.png")

                    cell_image = Image.fromarray(self.image[y2:y1, x1:x2])
                    cell_image.save(temp_image_path)

                    text = self.reader.readtext(temp_image_path, detail=0)
                    text = " ".join(text)

                table.cells[r_idx][c_idx].text = text

        data = table.data
        table.df = pd.DataFrame(data)
        table.shape = table.df.shape

        table.flavor = "lattice_ocr"
        table.accuracy = 0
        table.whitespace = 0
        table.order = table_idx + 1
        table.page = int(os.path.basename(self.rootname).replace("page-", ""))

        # for plotting
        table._text = None
        table._image = (self.image, self.table_bbox_unscaled)
        table._segments = (self.vertical_segments, self.horizontal_segments)
        table._textedges = None

        return table

    def extract_tables(self, filename, suppress_stdout=False, layout_kwargs={}):
        self._generate_layout(filename, layout_kwargs)
        if not suppress_stdout:
            logger.info("Processing {}".format(os.path.basename(self.rootname)))

        self._generate_image()
        self._generate_table_bbox()

        _tables = []
        # sort tables based on y-coord
        for table_idx, tk in enumerate(
            sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True)
        ):
            cols, rows = self._generate_columns_and_rows(table_idx, tk)
            table = self._generate_table(table_idx, cols, rows)
            table._bbox = tk
            _tables.append(table)

        return _tables