pull/252/merge
rahulbhave 2021-07-12 12:12:20 +05:30 committed by GitHub
commit e10906e8b0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 56 additions and 0 deletions

View File

@ -0,0 +1,56 @@
"Test to check intersection logic when no intersection area returned"
import os
import sys
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import (
LAParams,
LTAnno,
LTChar,
LTTextLineHorizontal,
LTTextLineVertical,
LTImage,
LTTextBoxHorizontal
)
# Directory holding the PDF fixtures: the "files" folder next to this module.
_this_dir = os.path.dirname(os.path.abspath(__file__))
testdir = os.path.join(_this_dir, "files")
from camelot.utils import bbox_intersection_area
def get_text_from_pdf(filename):
    """Return the first horizontal text box found in a PDF.

    Pages are processed in document order with default ``LAParams``; the
    first layout element that is an ``LTTextBoxHorizontal`` is returned.

    Args:
        filename: Path of the PDF file to read.

    Returns:
        The first ``LTTextBoxHorizontal`` encountered, or ``None`` when no
        page contains one.

    Raises:
        OSError: If the file cannot be opened (e.g. ``FileNotFoundError``).
    """
    # https://stackoverflow.com/questions/22898145/how-to-extract-text-and-text-coordinates-from-a-pdf-file
    # https://pdfminer-docs.readthedocs.io/programming.html#performing-layout-analysis
    # The context manager guarantees the handle is closed on every exit path
    # (early return, no match, or a parser exception).
    with open(filename, "rb") as document:
        # Create resource manager.
        rsrcmgr = PDFResourceManager()
        # Set parameters for analysis.
        laparams = LAParams()
        # Create a PDF page aggregator object.
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.get_pages(document):
            interpreter.process_page(page)
            # Receive the LTPage object for the page just processed.
            layout = device.get_result()
            for element in layout:
                if isinstance(element, LTTextBoxHorizontal):
                    return element
    # No horizontal text box on any page.
    return None
def test_bbox_intersection_text():
    """Check that the intersection area of two text boxes is reported as 0.0.

    The boxes are the first horizontal text box of ``foo.pdf`` and of
    ``tabula/12s0324.pdf``, both read from the test files directory.
    """
    first_box = get_text_from_pdf(os.path.join(testdir, "foo.pdf"))
    second_box = get_text_from_pdf(os.path.join(testdir, "tabula/12s0324.pdf"))
    overlap_area = bbox_intersection_area(first_box, second_box)
    assert overlap_area == 0.0