Merge pull request #243 from socialcopsdev/add-table-regions

[MRG] Add table regions support
pull/2/head
Vinayak Mehta 2019-01-04 22:00:11 +05:30 committed by GitHub
commit 7cf409aa08
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 162 additions and 60 deletions

View File

@ -4,6 +4,13 @@ Release History
master master
------ ------
**Improvements**
* [#209](https://github.com/socialcopsdev/camelot/issues/209) Add support to analyze only certain page regions to look for tables. [#243](https://github.com/socialcopsdev/camelot/pull/243) by Vinayak Mehta.
* You can use `table_regions` in `read_pdf()` to specify approximate page regions which may contain tables.
* Kwarg `line_size_scaling` is now called `line_scale`.
* [#239](https://github.com/socialcopsdev/camelot/issues/239) Raise warning if PDF is image-based. [#240](https://github.com/socialcopsdev/camelot/pull/240) by Vinayak Mehta.
0.6.0 (2018-12-24) 0.6.0 (2018-12-24)
------------------ ------------------

View File

@ -56,12 +56,15 @@ def cli(ctx, *args, **kwargs):
@cli.command('lattice') @cli.command('lattice')
@click.option('-R', '--table_regions', default=[], multiple=True,
help='Page regions to analyze. Example: x1,y1,x2,y2'
' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
@click.option('-T', '--table_areas', default=[], multiple=True, @click.option('-T', '--table_areas', default=[], multiple=True,
help='Table areas to process. Example: x1,y1,x2,y2' help='Table areas to process. Example: x1,y1,x2,y2'
' where x1, y1 -> left-top and x2, y2 -> right-bottom.') ' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
@click.option('-back', '--process_background', is_flag=True, @click.option('-back', '--process_background', is_flag=True,
help='Process background lines.') help='Process background lines.')
@click.option('-scale', '--line_size_scaling', default=15, @click.option('-scale', '--line_scale', default=15,
help='Line size scaling factor. The larger the value,' help='Line size scaling factor. The larger the value,'
' the smaller the detected lines.') ' the smaller the detected lines.')
@click.option('-copy', '--copy_text', default=[], type=click.Choice(['h', 'v']), @click.option('-copy', '--copy_text', default=[], type=click.Choice(['h', 'v']),
@ -105,6 +108,8 @@ def lattice(c, *args, **kwargs):
filepath = kwargs.pop('filepath') filepath = kwargs.pop('filepath')
kwargs.update(conf) kwargs.update(conf)
table_regions = list(kwargs['table_regions'])
kwargs['table_regions'] = None if not table_regions else table_regions
table_areas = list(kwargs['table_areas']) table_areas = list(kwargs['table_areas'])
kwargs['table_areas'] = None if not table_areas else table_areas kwargs['table_areas'] = None if not table_areas else table_areas
copy_text = list(kwargs['copy_text']) copy_text = list(kwargs['copy_text'])
@ -132,6 +137,9 @@ def lattice(c, *args, **kwargs):
@cli.command('stream') @cli.command('stream')
@click.option('-R', '--table_regions', default=[], multiple=True,
help='Page regions to analyze. Example: x1,y1,x2,y2'
' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
@click.option('-T', '--table_areas', default=[], multiple=True, @click.option('-T', '--table_areas', default=[], multiple=True,
help='Table areas to process. Example: x1,y1,x2,y2' help='Table areas to process. Example: x1,y1,x2,y2'
' where x1, y1 -> left-top and x2, y2 -> right-bottom.') ' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
@ -160,6 +168,8 @@ def stream(c, *args, **kwargs):
filepath = kwargs.pop('filepath') filepath = kwargs.pop('filepath')
kwargs.update(conf) kwargs.update(conf)
table_regions = list(kwargs['table_regions'])
kwargs['table_regions'] = None if not table_regions else table_regions
table_areas = list(kwargs['table_areas']) table_areas = list(kwargs['table_areas'])
kwargs['table_areas'] = None if not table_areas else table_areas kwargs['table_areas'] = None if not table_areas else table_areas
columns = list(kwargs['columns']) columns = list(kwargs['columns'])

View File

@ -48,7 +48,8 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
return img, threshold return img, threshold
def find_lines(threshold, direction='horizontal', line_size_scaling=15, iterations=0): def find_lines(threshold, regions=None, direction='horizontal',
line_scale=15, iterations=0):
"""Finds horizontal and vertical lines by applying morphological """Finds horizontal and vertical lines by applying morphological
transformations on an image. transformations on an image.
@ -56,9 +57,13 @@ def find_lines(threshold, direction='horizontal', line_size_scaling=15, iteratio
---------- ----------
threshold : object threshold : object
numpy.ndarray representing the thresholded image. numpy.ndarray representing the thresholded image.
regions : list, optional (default: None)
List of page regions that may contain tables of the form x1,y1,x2,y2
where (x1, y1) -> left-top and (x2, y2) -> right-bottom
in image coordinate space.
direction : string, optional (default: 'horizontal') direction : string, optional (default: 'horizontal')
Specifies whether to find vertical or horizontal lines. Specifies whether to find vertical or horizontal lines.
line_size_scaling : int, optional (default: 15) line_scale : int, optional (default: 15)
Factor by which the page dimensions will be divided to get Factor by which the page dimensions will be divided to get
smallest length of lines that should be detected. smallest length of lines that should be detected.
@ -83,26 +88,33 @@ def find_lines(threshold, direction='horizontal', line_size_scaling=15, iteratio
lines = [] lines = []
if direction == 'vertical': if direction == 'vertical':
size = threshold.shape[0] // line_size_scaling size = threshold.shape[0] // line_scale
el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size)) el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size))
elif direction == 'horizontal': elif direction == 'horizontal':
size = threshold.shape[1] // line_size_scaling size = threshold.shape[1] // line_scale
el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1)) el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1))
elif direction is None: elif direction is None:
raise ValueError("Specify direction as either 'vertical' or" raise ValueError("Specify direction as either 'vertical' or"
" 'horizontal'") " 'horizontal'")
if regions is not None:
region_mask = np.zeros(threshold.shape)
for region in regions:
x, y, w, h = region
region_mask[y : y + h, x : x + w] = 1
threshold = np.multiply(threshold, region_mask)
threshold = cv2.erode(threshold, el) threshold = cv2.erode(threshold, el)
threshold = cv2.dilate(threshold, el) threshold = cv2.dilate(threshold, el)
dmask = cv2.dilate(threshold, el, iterations=iterations) dmask = cv2.dilate(threshold, el, iterations=iterations)
try: try:
_, contours, _ = cv2.findContours( _, contours, _ = cv2.findContours(
threshold, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
except ValueError: except ValueError:
# for opencv backward compatibility # for opencv backward compatibility
contours, _ = cv2.findContours( contours, _ = cv2.findContours(
threshold, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
for c in contours: for c in contours:
x, y, w, h = cv2.boundingRect(c) x, y, w, h = cv2.boundingRect(c)
@ -116,7 +128,7 @@ def find_lines(threshold, direction='horizontal', line_size_scaling=15, iteratio
return dmask, lines return dmask, lines
def find_table_contours(vertical, horizontal): def find_contours(vertical, horizontal):
"""Finds table boundaries using OpenCV's findContours. """Finds table boundaries using OpenCV's findContours.
Parameters Parameters
@ -138,11 +150,12 @@ def find_table_contours(vertical, horizontal):
try: try:
__, contours, __ = cv2.findContours( __, contours, __ = cv2.findContours(
mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
except ValueError: except ValueError:
# for opencv backward compatibility # for opencv backward compatibility
contours, __ = cv2.findContours( contours, __ = cv2.findContours(
mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
# sort in reverse based on contour area and use first 10 contours
contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10] contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10]
cont = [] cont = []
@ -153,7 +166,7 @@ def find_table_contours(vertical, horizontal):
return cont return cont
def find_table_joints(contours, vertical, horizontal): def find_joints(contours, vertical, horizontal):
"""Finds joints/intersections present inside each table boundary. """Finds joints/intersections present inside each table boundary.
Parameters Parameters
@ -176,18 +189,18 @@ def find_table_joints(contours, vertical, horizontal):
and (x2, y2) -> rt in image coordinate space. and (x2, y2) -> rt in image coordinate space.
""" """
joints = np.bitwise_and(vertical, horizontal) joints = np.multiply(vertical, horizontal)
tables = {} tables = {}
for c in contours: for c in contours:
x, y, w, h = c x, y, w, h = c
roi = joints[y : y + h, x : x + w] roi = joints[y : y + h, x : x + w]
try: try:
__, jc, __ = cv2.findContours( __, jc, __ = cv2.findContours(
roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE) roi.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
except ValueError: except ValueError:
# for opencv backward compatibility # for opencv backward compatibility
jc, __ = cv2.findContours( jc, __ = cv2.findContours(
roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE) roi.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
if len(jc) <= 4: # remove contours with less than 4 joints if len(jc) <= 4: # remove contours with less than 4 joints
continue continue
joint_coords = [] joint_coords = []

View File

@ -52,7 +52,7 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice',
to generate columns. to generate columns.
process_background* : bool, optional (default: False) process_background* : bool, optional (default: False)
Process background lines. Process background lines.
line_size_scaling* : int, optional (default: 15) line_scale* : int, optional (default: 15)
Line size scaling factor. The larger the value the smaller Line size scaling factor. The larger the value the smaller
the detected lines. Making it very large will lead to text the detected lines. Making it very large will lead to text
being detected as lines. being detected as lines.

View File

@ -16,7 +16,7 @@ from ..utils import (scale_image, scale_pdf, segments_in_bbox, text_in_bbox,
merge_close_lines, get_table_index, compute_accuracy, merge_close_lines, get_table_index, compute_accuracy,
compute_whitespace) compute_whitespace)
from ..image_processing import (adaptive_threshold, find_lines, from ..image_processing import (adaptive_threshold, find_lines,
find_table_contours, find_table_joints) find_contours, find_joints)
logger = logging.getLogger('camelot') logger = logging.getLogger('camelot')
@ -28,13 +28,17 @@ class Lattice(BaseParser):
Parameters Parameters
---------- ----------
table_regions : list, optional (default: None)
List of page regions that may contain tables of the form x1,y1,x2,y2
where (x1, y1) -> left-top and (x2, y2) -> right-bottom
in PDF coordinate space.
table_areas : list, optional (default: None) table_areas : list, optional (default: None)
List of table area strings of the form x1,y1,x2,y2 List of table area strings of the form x1,y1,x2,y2
where (x1, y1) -> left-top and (x2, y2) -> right-bottom where (x1, y1) -> left-top and (x2, y2) -> right-bottom
in PDF coordinate space. in PDF coordinate space.
process_background : bool, optional (default: False) process_background : bool, optional (default: False)
Process background lines. Process background lines.
line_size_scaling : int, optional (default: 15) line_scale : int, optional (default: 15)
Line size scaling factor. The larger the value the smaller Line size scaling factor. The larger the value the smaller
the detected lines. Making it very large will lead to text the detected lines. Making it very large will lead to text
being detected as lines. being detected as lines.
@ -77,14 +81,15 @@ class Lattice(BaseParser):
Resolution used for PDF to PNG conversion. Resolution used for PDF to PNG conversion.
""" """
def __init__(self, table_areas=None, process_background=False, def __init__(self, table_regions=None, table_areas=None, process_background=False,
line_size_scaling=15, copy_text=None, shift_text=['l', 't'], line_scale=15, copy_text=None, shift_text=['l', 't'],
split_text=False, flag_size=False, strip_text='', line_tol=2, split_text=False, flag_size=False, strip_text='', line_tol=2,
joint_tol=2, threshold_blocksize=15, threshold_constant=-2, joint_tol=2, threshold_blocksize=15, threshold_constant=-2,
iterations=0, resolution=300, **kwargs): iterations=0, resolution=300, **kwargs):
self.table_regions = table_regions
self.table_areas = table_areas self.table_areas = table_areas
self.process_background = process_background self.process_background = process_background
self.line_size_scaling = line_size_scaling self.line_scale = line_scale
self.copy_text = copy_text self.copy_text = copy_text
self.shift_text = shift_text self.shift_text = shift_text
self.split_text = split_text self.split_text = split_text
@ -227,9 +232,22 @@ class Lattice(BaseParser):
stderr=subprocess.STDOUT) stderr=subprocess.STDOUT)
def _generate_table_bbox(self): def _generate_table_bbox(self):
def scale_areas(areas):
scaled_areas = []
for area in areas:
x1, y1, x2, y2 = area.split(",")
x1 = float(x1)
y1 = float(y1)
x2 = float(x2)
y2 = float(y2)
x1, y1, x2, y2 = scale_pdf((x1, y1, x2, y2), image_scalers)
scaled_areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1)))
return scaled_areas
self.image, self.threshold = adaptive_threshold( self.image, self.threshold = adaptive_threshold(
self.imagename, process_background=self.process_background, self.imagename, process_background=self.process_background,
blocksize=self.threshold_blocksize, c=self.threshold_constant) blocksize=self.threshold_blocksize, c=self.threshold_constant)
image_width = self.image.shape[1] image_width = self.image.shape[1]
image_height = self.image.shape[0] image_height = self.image.shape[0]
image_width_scaler = image_width / float(self.pdf_width) image_width_scaler = image_width / float(self.pdf_width)
@ -239,27 +257,30 @@ class Lattice(BaseParser):
image_scalers = (image_width_scaler, image_height_scaler, self.pdf_height) image_scalers = (image_width_scaler, image_height_scaler, self.pdf_height)
pdf_scalers = (pdf_width_scaler, pdf_height_scaler, image_height) pdf_scalers = (pdf_width_scaler, pdf_height_scaler, image_height)
vertical_mask, vertical_segments = find_lines( if self.table_areas is None:
self.threshold, direction='vertical', regions = None
line_size_scaling=self.line_size_scaling, iterations=self.iterations) if self.table_regions is not None:
horizontal_mask, horizontal_segments = find_lines( regions = scale_areas(self.table_regions)
self.threshold, direction='horizontal',
line_size_scaling=self.line_size_scaling, iterations=self.iterations)
if self.table_areas is not None: vertical_mask, vertical_segments = find_lines(
areas = [] self.threshold, regions=regions, direction='vertical',
for area in self.table_areas: line_scale=self.line_scale, iterations=self.iterations)
x1, y1, x2, y2 = area.split(",") horizontal_mask, horizontal_segments = find_lines(
x1 = float(x1) self.threshold, regions=regions, direction='horizontal',
y1 = float(y1) line_scale=self.line_scale, iterations=self.iterations)
x2 = float(x2)
y2 = float(y2) contours = find_contours(vertical_mask, horizontal_mask)
x1, y1, x2, y2 = scale_pdf((x1, y1, x2, y2), image_scalers) table_bbox = find_joints(contours, vertical_mask, horizontal_mask)
areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1)))
table_bbox = find_table_joints(areas, vertical_mask, horizontal_mask)
else: else:
contours = find_table_contours(vertical_mask, horizontal_mask) vertical_mask, vertical_segments = find_lines(
table_bbox = find_table_joints(contours, vertical_mask, horizontal_mask) self.threshold, direction='vertical', line_scale=self.line_scale,
iterations=self.iterations)
horizontal_mask, horizontal_segments = find_lines(
self.threshold, direction='horizontal', line_scale=self.line_scale,
iterations=self.iterations)
areas = scale_areas(self.table_areas)
table_bbox = find_joints(areas, vertical_mask, horizontal_mask)
self.table_bbox_unscaled = copy.deepcopy(table_bbox) self.table_bbox_unscaled = copy.deepcopy(table_bbox)

View File

@ -26,6 +26,10 @@ class Stream(BaseParser):
Parameters Parameters
---------- ----------
table_regions : list, optional (default: None)
List of page regions that may contain tables of the form x1,y1,x2,y2
where (x1, y1) -> left-top and (x2, y2) -> right-bottom
in PDF coordinate space.
table_areas : list, optional (default: None) table_areas : list, optional (default: None)
List of table area strings of the form x1,y1,x2,y2 List of table area strings of the form x1,y1,x2,y2
where (x1, y1) -> left-top and (x2, y2) -> right-bottom where (x1, y1) -> left-top and (x2, y2) -> right-bottom
@ -51,9 +55,10 @@ class Stream(BaseParser):
to generate columns. to generate columns.
""" """
def __init__(self, table_areas=None, columns=None, split_text=False, def __init__(self, table_regions=None, table_areas=None, columns=None, split_text=False,
flag_size=False, strip_text='', edge_tol=50, row_tol=2, flag_size=False, strip_text='', edge_tol=50, row_tol=2,
column_tol=0, **kwargs): column_tol=0, **kwargs):
self.table_regions = table_regions
self.table_areas = table_areas self.table_areas = table_areas
self.columns = columns self.columns = columns
self._validate_columns() self._validate_columns()
@ -275,7 +280,18 @@ class Stream(BaseParser):
def _generate_table_bbox(self): def _generate_table_bbox(self):
self.textedges = [] self.textedges = []
if self.table_areas is not None: if self.table_areas is None:
hor_text = self.horizontal_text
if self.table_regions is not None:
# filter horizontal text
hor_text = []
for region in self.table_regions:
x1, y1, x2, y2 = region
region_text = text_in_bbox((x1, y2, x2, y1), self.horizontal_text)
hor_text.extend(region_text)
# find tables based on nurminen's detection algorithm
table_bbox = self._nurminen_table_detection(hor_text)
else:
table_bbox = {} table_bbox = {}
for area in self.table_areas: for area in self.table_areas:
x1, y1, x2, y2 = area.split(",") x1, y1, x2, y2 = area.split(",")
@ -284,9 +300,6 @@ class Stream(BaseParser):
x2 = float(x2) x2 = float(x2)
y2 = float(y2) y2 = float(y2)
table_bbox[(x1, y2, x2, y1)] = None table_bbox[(x1, y2, x2, y1)] = None
else:
# find tables based on nurminen's detection algorithm
table_bbox = self._nurminen_table_detection(self.horizontal_text)
self.table_bbox = table_bbox self.table_bbox = table_bbox
def _generate_columns_and_rows(self, table_idx, tk): def _generate_columns_and_rows(self, table_idx, tk):

View File

@ -101,7 +101,7 @@ stream_kwargs = [
] ]
lattice_kwargs = [ lattice_kwargs = [
'process_background', 'process_background',
'line_size_scaling', 'line_scale',
'copy_text', 'copy_text',
'shift_text', 'shift_text',
'line_tol', 'line_tol',
@ -339,7 +339,7 @@ def text_in_bbox(bbox, text):
---------- ----------
bbox : tuple bbox : tuple
Tuple (x1, y1, x2, y2) representing a bounding box where Tuple (x1, y1, x2, y2) representing a bounding box where
(x1, y1) -> lb and (x2, y2) -> rt in PDFMiner coordinate (x1, y1) -> lb and (x2, y2) -> rt in the PDF coordinate
space. space.
text : List of PDFMiner text objects. text : List of PDFMiner text objects.

View File

@ -0,0 +1,4 @@
"Età dellAssicuratoallepoca del decesso","Misura % dimaggiorazione"
"18-75","1,00%"
"76-80","0,50%"
"81 in poi","0,10%"
1 Età dell’Assicuratoall’epoca del decesso Misura % dimaggiorazione
2 18-75 1,00%
3 76-80 0,50%
4 81 in poi 0,10%

Binary file not shown.

View File

@ -206,12 +206,10 @@ You can also visualize the textedges found on a page by specifying ``kind='texte
Specify table areas Specify table areas
------------------- -------------------
In cases such as `these <../_static/pdf/table_areas.pdf>`__, it can be useful to specify table boundaries. You can plot the text on this page and note the top left and bottom right coordinates of the table. In cases such as `these <../_static/pdf/table_areas.pdf>`__, it can be useful to specify exact table boundaries. You can plot the text on this page and note the top left and bottom right coordinates of the table.
Table areas that you want Camelot to analyze can be passed as a list of comma-separated strings to :meth:`read_pdf() <camelot.read_pdf>`, using the ``table_areas`` keyword argument. Table areas that you want Camelot to analyze can be passed as a list of comma-separated strings to :meth:`read_pdf() <camelot.read_pdf>`, using the ``table_areas`` keyword argument.
.. _for now: https://github.com/socialcopsdev/camelot/issues/102
:: ::
>>> tables = camelot.read_pdf('table_areas.pdf', flavor='stream', table_areas=['316,499,566,337']) >>> tables = camelot.read_pdf('table_areas.pdf', flavor='stream', table_areas=['316,499,566,337'])
@ -226,6 +224,27 @@ Table areas that you want Camelot to analyze can be passed as a list of comma-se
.. csv-table:: .. csv-table::
:file: ../_static/csv/table_areas.csv :file: ../_static/csv/table_areas.csv
Specify table regions
---------------------
However, there may be cases like `[1] <../_static/pdf/table_regions.pdf>`__ and `[2] <https://github.com/socialcopsdev/camelot/blob/master/tests/files/tableception.pdf>`__, where the table might not lie at the exact coordinates every time but in an approximate region.
You can use the ``table_regions`` keyword argument to :meth:`read_pdf() <camelot.read_pdf>` to solve for such cases. When ``table_regions`` is specified, Camelot will only analyze the specified regions to look for tables.
::
>>> tables = camelot.read_pdf('table_regions.pdf', table_regions=['170,370,560,270'])
>>> tables[0].df
.. tip::
Here's how you can do the same with the :ref:`command-line interface <cli>`.
::
$ camelot lattice -R 170,370,560,270 table_regions.pdf
.. csv-table::
:file: ../_static/csv/table_regions.csv
Specify column separators Specify column separators
------------------------- -------------------------
@ -434,11 +453,11 @@ You can pass ``row_tol=<+int>`` to group the rows closer together, as shown belo
Detect short lines Detect short lines
------------------ ------------------
There might be cases while using :ref:`Lattice <lattice>` when smaller lines don't get detected. The size of the smallest line that gets detected is calculated by dividing the PDF page's dimensions with a scaling factor called ``line_size_scaling``. By default, its value is 15. There might be cases while using :ref:`Lattice <lattice>` when smaller lines don't get detected. The size of the smallest line that gets detected is calculated by dividing the PDF page's dimensions with a scaling factor called ``line_scale``. By default, its value is 15.
As you can guess, the larger the ``line_size_scaling``, the smaller the size of lines getting detected. As you can guess, the larger the ``line_scale``, the smaller the size of lines getting detected.
.. warning:: Making ``line_size_scaling`` very large (>150) will lead to text getting detected as lines. .. warning:: Making ``line_scale`` very large (>150) will lead to text getting detected as lines.
Here's a `PDF <../_static/pdf/short_lines.pdf>`__ where small lines separating the headers don't get detected with the default value of 15. Here's a `PDF <../_static/pdf/short_lines.pdf>`__ where small lines separating the headers don't get detected with the default value of 15.
@ -458,11 +477,11 @@ Let's plot the table for this PDF.
:alt: A plot of the PDF table with short lines :alt: A plot of the PDF table with short lines
:align: left :align: left
Clearly, the smaller lines separating the headers couldn't be detected. Let's try with ``line_size_scaling=40``, and plot the table again. Clearly, the smaller lines separating the headers couldn't be detected. Let's try with ``line_scale=40``, and plot the table again.
:: ::
>>> tables = camelot.read_pdf('short_lines.pdf', line_size_scaling=40) >>> tables = camelot.read_pdf('short_lines.pdf', line_scale=40)
>>> camelot.plot(tables[0], kind='grid') >>> camelot.plot(tables[0], kind='grid')
>>> plt.show() >>> plt.show()
@ -511,7 +530,7 @@ We'll use the `PDF <../_static/pdf/short_lines.pdf>`__ from the previous example
:: ::
>>> tables = camelot.read_pdf('short_lines.pdf', line_size_scaling=40, shift_text=['']) >>> tables = camelot.read_pdf('short_lines.pdf', line_scale=40, shift_text=[''])
>>> tables[0].df >>> tables[0].df
.. csv-table:: .. csv-table::
@ -532,7 +551,7 @@ No surprises there — it did remain in place (observe the strings "2400" and "A
:: ::
>>> tables = camelot.read_pdf('short_lines.pdf', line_size_scaling=40, shift_text=['r', 'b']) >>> tables = camelot.read_pdf('short_lines.pdf', line_scale=40, shift_text=['r', 'b'])
>>> tables[0].df >>> tables[0].df
.. tip:: .. tip::

View File

@ -427,6 +427,13 @@ data_lattice_two_tables_2 = [
["Pooled", "23889", "47.7", "1.5", "9.9", "19.9", "17.8", "3.3"] ["Pooled", "23889", "47.7", "1.5", "9.9", "19.9", "17.8", "3.3"]
] ]
data_lattice_table_regions = [
['Età dellAssicurato \nallepoca del decesso', 'Misura % di \nmaggiorazione'],
['18-75', '1,00%'],
['76-80', '0,50%'],
['81 in poi', '0,10%']
]
data_lattice_table_areas = [ data_lattice_table_areas = [
["", "", "", "", "", "", "", "", ""], ["", "", "", "", "", "", "", "", ""],
["State", "n", "Literacy Status", "", "", "", "", "", ""], ["State", "n", "Literacy Status", "", "", "", "", "", ""],

Binary file not shown.

View File

@ -159,6 +159,14 @@ def test_lattice_two_tables():
assert df2.equals(tables[1].df) assert df2.equals(tables[1].df)
def test_lattice_table_regions():
df = pd.DataFrame(data_lattice_table_regions)
filename = os.path.join(testdir, "table_region.pdf")
tables = camelot.read_pdf(filename, table_regions=["170,370,560,270"])
assert df.equals(tables[0].df)
def test_lattice_table_areas(): def test_lattice_table_areas():
df = pd.DataFrame(data_lattice_table_areas) df = pd.DataFrame(data_lattice_table_areas)
@ -179,7 +187,7 @@ def test_lattice_copy_text():
df = pd.DataFrame(data_lattice_copy_text) df = pd.DataFrame(data_lattice_copy_text)
filename = os.path.join(testdir, "row_span_1.pdf") filename = os.path.join(testdir, "row_span_1.pdf")
tables = camelot.read_pdf(filename, line_size_scaling=60, copy_text="v") tables = camelot.read_pdf(filename, line_scale=60, copy_text="v")
assert df.equals(tables[0].df) assert df.equals(tables[0].df)
@ -189,13 +197,13 @@ def test_lattice_shift_text():
df_rb = pd.DataFrame(data_lattice_shift_text_right_bottom) df_rb = pd.DataFrame(data_lattice_shift_text_right_bottom)
filename = os.path.join(testdir, "column_span_2.pdf") filename = os.path.join(testdir, "column_span_2.pdf")
tables = camelot.read_pdf(filename, line_size_scaling=40) tables = camelot.read_pdf(filename, line_scale=40)
assert df_lt.equals(tables[0].df) assert df_lt.equals(tables[0].df)
tables = camelot.read_pdf(filename, line_size_scaling=40, shift_text=['']) tables = camelot.read_pdf(filename, line_scale=40, shift_text=[''])
assert df_disable.equals(tables[0].df) assert df_disable.equals(tables[0].df)
tables = camelot.read_pdf(filename, line_size_scaling=40, shift_text=['r', 'b']) tables = camelot.read_pdf(filename, line_scale=40, shift_text=['r', 'b'])
assert df_rb.equals(tables[0].df) assert df_rb.equals(tables[0].df)