added test case for method bbox_no_intersection method
changed the test name to be more aligned with other testspull/252/head
parent
f43235934b
commit
82d0bf2881
|
|
@ -0,0 +1,56 @@
|
||||||
|
"Test to check intersection logic when no intersection area returned"
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
|
from pdfminer.pdfparser import PDFParser
|
||||||
|
from pdfminer.pdfdocument import PDFDocument
|
||||||
|
from pdfminer.pdfpage import PDFPage
|
||||||
|
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
|
||||||
|
from pdfminer.pdfinterp import PDFResourceManager
|
||||||
|
from pdfminer.pdfinterp import PDFPageInterpreter
|
||||||
|
from pdfminer.converter import PDFPageAggregator
|
||||||
|
from pdfminer.layout import (
|
||||||
|
LAParams,
|
||||||
|
LTAnno,
|
||||||
|
LTChar,
|
||||||
|
LTTextLineHorizontal,
|
||||||
|
LTTextLineVertical,
|
||||||
|
LTImage,
|
||||||
|
LTTextBoxHorizontal
|
||||||
|
)
|
||||||
|
|
||||||
|
testdir = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
testdir = os.path.join(testdir, "files")
|
||||||
|
|
||||||
|
from camelot.utils import bbox_intersection_area
|
||||||
|
|
||||||
|
def get_text_from_pdf(filename):
|
||||||
|
"Method to extract text object from pdf"
|
||||||
|
#https://stackoverflow.com/questions/22898145/how-to-extract-text-and-text-coordinates-from-a-pdf-file
|
||||||
|
#https://pdfminer-docs.readthedocs.io/programming.html#performing-layout-analysis
|
||||||
|
document = open(filename, 'rb')
|
||||||
|
#Create resource manager
|
||||||
|
rsrcmgr = PDFResourceManager()
|
||||||
|
# Set parameters for analysis.
|
||||||
|
laparams = LAParams()
|
||||||
|
# Create a PDF page aggregator object.
|
||||||
|
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
|
||||||
|
interpreter = PDFPageInterpreter(rsrcmgr, device)
|
||||||
|
for page in PDFPage.get_pages(document):
|
||||||
|
interpreter.process_page(page)
|
||||||
|
# receive the LTPage object for the page.
|
||||||
|
layout = device.get_result()
|
||||||
|
for element in layout:
|
||||||
|
if isinstance(element, LTTextBoxHorizontal):
|
||||||
|
return element
|
||||||
|
|
||||||
|
def test_bbox_intersection_text():
|
||||||
|
"""
|
||||||
|
Test to check area of intersection between both boxes when no intersection area returned
|
||||||
|
"""
|
||||||
|
filename1 = os.path.join(testdir, "foo.pdf")
|
||||||
|
pdftextelement1 = get_text_from_pdf(filename1)
|
||||||
|
filename2 = os.path.join(testdir, "tabula/12s0324.pdf")
|
||||||
|
pdftextelement2 = get_text_from_pdf(filename2)
|
||||||
|
|
||||||
|
assert bbox_intersection_area(pdftextelement1, pdftextelement2) == 0.0
|
||||||
Loading…
Reference in New Issue