added test case for the bbox_no_intersection method

changed the test name to be more aligned with other tests
pull/252/head
Rahul.Bhave 2021-07-09 23:37:50 +05:30
parent f43235934b
commit 82d0bf2881
1 changed files with 56 additions and 0 deletions

View File

@ -0,0 +1,56 @@
"Test to check intersection logic when no intersection area returned"
import os
import sys
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import (
LAParams,
LTAnno,
LTChar,
LTTextLineHorizontal,
LTTextLineVertical,
LTImage,
LTTextBoxHorizontal
)
# Fixture PDFs live in the "files" directory that sits next to this module.
testdir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "files")
from camelot.utils import bbox_intersection_area
def get_text_from_pdf(filename):
    """Return the first horizontal text box found in a PDF file.

    Each page yielded by ``PDFPage.get_pages`` is run through pdfminer's
    layout analysis in turn, and the search stops at the first
    ``LTTextBoxHorizontal`` element encountered.

    Args:
        filename: Path of the PDF file to read.

    Returns:
        The first ``LTTextBoxHorizontal`` layout element, or ``None`` when
        no processed page contains one.
    """
    # Approach taken from:
    # https://stackoverflow.com/questions/22898145/how-to-extract-text-and-text-coordinates-from-a-pdf-file
    # https://pdfminer-docs.readthedocs.io/programming.html#performing-layout-analysis
    # Create resource manager.
    rsrcmgr = PDFResourceManager()
    # Set parameters for analysis.
    laparams = LAParams()
    # Create a PDF page aggregator object and the interpreter that feeds it.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # The context manager closes the file on every exit path, including the
    # early return below; the original left the handle open.
    with open(filename, 'rb') as document:
        for page in PDFPage.get_pages(document):
            interpreter.process_page(page)
            # Receive the LTPage object for the page just processed.
            layout = device.get_result()
            for element in layout:
                if isinstance(element, LTTextBoxHorizontal):
                    return element
    # No horizontal text box on any page.
    return None
def test_bbox_intersection_text():
    """Check that text boxes from two different PDFs have zero intersection area.

    The first horizontal text box of each fixture file is extracted and the
    pair is passed to ``bbox_intersection_area``; the expected result for
    these two boxes is exactly ``0.0``.
    """
    first_box = get_text_from_pdf(os.path.join(testdir, "foo.pdf"))
    second_box = get_text_from_pdf(os.path.join(testdir, "tabula/12s0324.pdf"))
    overlap_area = bbox_intersection_area(first_box, second_box)
    assert overlap_area == 0.0