Merge pull request #243 from socialcopsdev/add-table-regions
[MRG] Add table regions support
pull/2/head
commit
7cf409aa08
|
|
@ -4,6 +4,13 @@ Release History
|
|||
master
|
||||
------
|
||||
|
||||
**Improvements**
|
||||
|
||||
* [#209](https://github.com/socialcopsdev/camelot/issues/209) Add support to analyze only certain page regions to look for tables. [#243](https://github.com/socialcopsdev/camelot/pull/243) by Vinayak Mehta.
|
||||
* You can use `table_regions` in `read_pdf()` to specify approximate page regions which may contain tables.
|
||||
* Kwarg `line_size_scaling` is now called `line_scale`.
|
||||
* [#239](https://github.com/socialcopsdev/camelot/issues/239) Raise warning if PDF is image-based. [#240](https://github.com/socialcopsdev/camelot/pull/240) by Vinayak Mehta.
|
||||
|
||||
0.6.0 (2018-12-24)
|
||||
------------------
|
||||
|
||||
|
|
|
|||
|
|
@ -56,12 +56,15 @@ def cli(ctx, *args, **kwargs):
|
|||
|
||||
|
||||
@cli.command('lattice')
|
||||
@click.option('-R', '--table_regions', default=[], multiple=True,
|
||||
help='Page regions to analyze. Example: x1,y1,x2,y2'
|
||||
' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
|
||||
@click.option('-T', '--table_areas', default=[], multiple=True,
|
||||
help='Table areas to process. Example: x1,y1,x2,y2'
|
||||
' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
|
||||
@click.option('-back', '--process_background', is_flag=True,
|
||||
help='Process background lines.')
|
||||
@click.option('-scale', '--line_size_scaling', default=15,
|
||||
@click.option('-scale', '--line_scale', default=15,
|
||||
help='Line size scaling factor. The larger the value,'
|
||||
' the smaller the detected lines.')
|
||||
@click.option('-copy', '--copy_text', default=[], type=click.Choice(['h', 'v']),
|
||||
|
|
@ -105,6 +108,8 @@ def lattice(c, *args, **kwargs):
|
|||
filepath = kwargs.pop('filepath')
|
||||
kwargs.update(conf)
|
||||
|
||||
table_regions = list(kwargs['table_regions'])
|
||||
kwargs['table_regions'] = None if not table_regions else table_regions
|
||||
table_areas = list(kwargs['table_areas'])
|
||||
kwargs['table_areas'] = None if not table_areas else table_areas
|
||||
copy_text = list(kwargs['copy_text'])
|
||||
|
|
@ -132,6 +137,9 @@ def lattice(c, *args, **kwargs):
|
|||
|
||||
|
||||
@cli.command('stream')
|
||||
@click.option('-R', '--table_regions', default=[], multiple=True,
|
||||
help='Page regions to analyze. Example: x1,y1,x2,y2'
|
||||
' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
|
||||
@click.option('-T', '--table_areas', default=[], multiple=True,
|
||||
help='Table areas to process. Example: x1,y1,x2,y2'
|
||||
' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
|
||||
|
|
@ -160,6 +168,8 @@ def stream(c, *args, **kwargs):
|
|||
filepath = kwargs.pop('filepath')
|
||||
kwargs.update(conf)
|
||||
|
||||
table_regions = list(kwargs['table_regions'])
|
||||
kwargs['table_regions'] = None if not table_regions else table_regions
|
||||
table_areas = list(kwargs['table_areas'])
|
||||
kwargs['table_areas'] = None if not table_areas else table_areas
|
||||
columns = list(kwargs['columns'])
|
||||
|
|
|
|||
|
|
@ -48,7 +48,8 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
|
|||
return img, threshold
|
||||
|
||||
|
||||
def find_lines(threshold, direction='horizontal', line_size_scaling=15, iterations=0):
|
||||
def find_lines(threshold, regions=None, direction='horizontal',
|
||||
line_scale=15, iterations=0):
|
||||
"""Finds horizontal and vertical lines by applying morphological
|
||||
transformations on an image.
|
||||
|
||||
|
|
@ -56,9 +57,13 @@ def find_lines(threshold, direction='horizontal', line_size_scaling=15, iteratio
|
|||
----------
|
||||
threshold : object
|
||||
numpy.ndarray representing the thresholded image.
|
||||
regions : list, optional (default: None)
|
||||
List of page regions that may contain tables of the form x1,y1,x2,y2
|
||||
where (x1, y1) -> left-top and (x2, y2) -> right-bottom
|
||||
in image coordinate space.
|
||||
direction : string, optional (default: 'horizontal')
|
||||
Specifies whether to find vertical or horizontal lines.
|
||||
line_size_scaling : int, optional (default: 15)
|
||||
line_scale : int, optional (default: 15)
|
||||
Factor by which the page dimensions will be divided to get
|
||||
smallest length of lines that should be detected.
|
||||
|
||||
|
|
@ -83,26 +88,33 @@ def find_lines(threshold, direction='horizontal', line_size_scaling=15, iteratio
|
|||
lines = []
|
||||
|
||||
if direction == 'vertical':
|
||||
size = threshold.shape[0] // line_size_scaling
|
||||
size = threshold.shape[0] // line_scale
|
||||
el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size))
|
||||
elif direction == 'horizontal':
|
||||
size = threshold.shape[1] // line_size_scaling
|
||||
size = threshold.shape[1] // line_scale
|
||||
el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1))
|
||||
elif direction is None:
|
||||
raise ValueError("Specify direction as either 'vertical' or"
|
||||
" 'horizontal'")
|
||||
|
||||
if regions is not None:
|
||||
region_mask = np.zeros(threshold.shape)
|
||||
for region in regions:
|
||||
x, y, w, h = region
|
||||
region_mask[y : y + h, x : x + w] = 1
|
||||
threshold = np.multiply(threshold, region_mask)
|
||||
|
||||
threshold = cv2.erode(threshold, el)
|
||||
threshold = cv2.dilate(threshold, el)
|
||||
dmask = cv2.dilate(threshold, el, iterations=iterations)
|
||||
|
||||
try:
|
||||
_, contours, _ = cv2.findContours(
|
||||
threshold, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
||||
threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
||||
except ValueError:
|
||||
# for opencv backward compatibility
|
||||
contours, _ = cv2.findContours(
|
||||
threshold, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
||||
threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
||||
|
||||
for c in contours:
|
||||
x, y, w, h = cv2.boundingRect(c)
|
||||
|
|
@ -116,7 +128,7 @@ def find_lines(threshold, direction='horizontal', line_size_scaling=15, iteratio
|
|||
return dmask, lines
|
||||
|
||||
|
||||
def find_table_contours(vertical, horizontal):
|
||||
def find_contours(vertical, horizontal):
|
||||
"""Finds table boundaries using OpenCV's findContours.
|
||||
|
||||
Parameters
|
||||
|
|
@ -138,11 +150,12 @@ def find_table_contours(vertical, horizontal):
|
|||
|
||||
try:
|
||||
__, contours, __ = cv2.findContours(
|
||||
mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
||||
mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
||||
except ValueError:
|
||||
# for opencv backward compatibility
|
||||
contours, __ = cv2.findContours(
|
||||
mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
||||
mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
||||
# sort in reverse based on contour area and use first 10 contours
|
||||
contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10]
|
||||
|
||||
cont = []
|
||||
|
|
@ -153,7 +166,7 @@ def find_table_contours(vertical, horizontal):
|
|||
return cont
|
||||
|
||||
|
||||
def find_table_joints(contours, vertical, horizontal):
|
||||
def find_joints(contours, vertical, horizontal):
|
||||
"""Finds joints/intersections present inside each table boundary.
|
||||
|
||||
Parameters
|
||||
|
|
@ -176,18 +189,18 @@ def find_table_joints(contours, vertical, horizontal):
|
|||
and (x2, y2) -> rt in image coordinate space.
|
||||
|
||||
"""
|
||||
joints = np.bitwise_and(vertical, horizontal)
|
||||
joints = np.multiply(vertical, horizontal)
|
||||
tables = {}
|
||||
for c in contours:
|
||||
x, y, w, h = c
|
||||
roi = joints[y : y + h, x : x + w]
|
||||
try:
|
||||
__, jc, __ = cv2.findContours(
|
||||
roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
|
||||
roi.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
|
||||
except ValueError:
|
||||
# for opencv backward compatibility
|
||||
jc, __ = cv2.findContours(
|
||||
roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
|
||||
roi.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
|
||||
if len(jc) <= 4: # remove contours with less than 4 joints
|
||||
continue
|
||||
joint_coords = []
|
||||
|
|
|
|||
|
|
@ -52,7 +52,7 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice',
|
|||
to generate columns.
|
||||
process_background* : bool, optional (default: False)
|
||||
Process background lines.
|
||||
line_size_scaling* : int, optional (default: 15)
|
||||
line_scale* : int, optional (default: 15)
|
||||
Line size scaling factor. The larger the value the smaller
|
||||
the detected lines. Making it very large will lead to text
|
||||
being detected as lines.
|
||||
|
|
|
|||
|
|
@ -16,7 +16,7 @@ from ..utils import (scale_image, scale_pdf, segments_in_bbox, text_in_bbox,
|
|||
merge_close_lines, get_table_index, compute_accuracy,
|
||||
compute_whitespace)
|
||||
from ..image_processing import (adaptive_threshold, find_lines,
|
||||
find_table_contours, find_table_joints)
|
||||
find_contours, find_joints)
|
||||
|
||||
|
||||
logger = logging.getLogger('camelot')
|
||||
|
|
@ -28,13 +28,17 @@ class Lattice(BaseParser):
|
|||
|
||||
Parameters
|
||||
----------
|
||||
table_regions : list, optional (default: None)
|
||||
List of page regions that may contain tables of the form x1,y1,x2,y2
|
||||
where (x1, y1) -> left-top and (x2, y2) -> right-bottom
|
||||
in PDF coordinate space.
|
||||
table_areas : list, optional (default: None)
|
||||
List of table area strings of the form x1,y1,x2,y2
|
||||
where (x1, y1) -> left-top and (x2, y2) -> right-bottom
|
||||
in PDF coordinate space.
|
||||
process_background : bool, optional (default: False)
|
||||
Process background lines.
|
||||
line_size_scaling : int, optional (default: 15)
|
||||
line_scale : int, optional (default: 15)
|
||||
Line size scaling factor. The larger the value the smaller
|
||||
the detected lines. Making it very large will lead to text
|
||||
being detected as lines.
|
||||
|
|
@ -77,14 +81,15 @@ class Lattice(BaseParser):
|
|||
Resolution used for PDF to PNG conversion.
|
||||
|
||||
"""
|
||||
def __init__(self, table_areas=None, process_background=False,
|
||||
line_size_scaling=15, copy_text=None, shift_text=['l', 't'],
|
||||
def __init__(self, table_regions=None, table_areas=None, process_background=False,
|
||||
line_scale=15, copy_text=None, shift_text=['l', 't'],
|
||||
split_text=False, flag_size=False, strip_text='', line_tol=2,
|
||||
joint_tol=2, threshold_blocksize=15, threshold_constant=-2,
|
||||
iterations=0, resolution=300, **kwargs):
|
||||
self.table_regions = table_regions
|
||||
self.table_areas = table_areas
|
||||
self.process_background = process_background
|
||||
self.line_size_scaling = line_size_scaling
|
||||
self.line_scale = line_scale
|
||||
self.copy_text = copy_text
|
||||
self.shift_text = shift_text
|
||||
self.split_text = split_text
|
||||
|
|
@ -227,9 +232,22 @@ class Lattice(BaseParser):
|
|||
stderr=subprocess.STDOUT)
|
||||
|
||||
def _generate_table_bbox(self):
|
||||
def scale_areas(areas):
|
||||
scaled_areas = []
|
||||
for area in areas:
|
||||
x1, y1, x2, y2 = area.split(",")
|
||||
x1 = float(x1)
|
||||
y1 = float(y1)
|
||||
x2 = float(x2)
|
||||
y2 = float(y2)
|
||||
x1, y1, x2, y2 = scale_pdf((x1, y1, x2, y2), image_scalers)
|
||||
scaled_areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1)))
|
||||
return scaled_areas
|
||||
|
||||
self.image, self.threshold = adaptive_threshold(
|
||||
self.imagename, process_background=self.process_background,
|
||||
blocksize=self.threshold_blocksize, c=self.threshold_constant)
|
||||
|
||||
image_width = self.image.shape[1]
|
||||
image_height = self.image.shape[0]
|
||||
image_width_scaler = image_width / float(self.pdf_width)
|
||||
|
|
@ -239,27 +257,30 @@ class Lattice(BaseParser):
|
|||
image_scalers = (image_width_scaler, image_height_scaler, self.pdf_height)
|
||||
pdf_scalers = (pdf_width_scaler, pdf_height_scaler, image_height)
|
||||
|
||||
vertical_mask, vertical_segments = find_lines(
|
||||
self.threshold, direction='vertical',
|
||||
line_size_scaling=self.line_size_scaling, iterations=self.iterations)
|
||||
horizontal_mask, horizontal_segments = find_lines(
|
||||
self.threshold, direction='horizontal',
|
||||
line_size_scaling=self.line_size_scaling, iterations=self.iterations)
|
||||
if self.table_areas is None:
|
||||
regions = None
|
||||
if self.table_regions is not None:
|
||||
regions = scale_areas(self.table_regions)
|
||||
|
||||
if self.table_areas is not None:
|
||||
areas = []
|
||||
for area in self.table_areas:
|
||||
x1, y1, x2, y2 = area.split(",")
|
||||
x1 = float(x1)
|
||||
y1 = float(y1)
|
||||
x2 = float(x2)
|
||||
y2 = float(y2)
|
||||
x1, y1, x2, y2 = scale_pdf((x1, y1, x2, y2), image_scalers)
|
||||
areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1)))
|
||||
table_bbox = find_table_joints(areas, vertical_mask, horizontal_mask)
|
||||
vertical_mask, vertical_segments = find_lines(
|
||||
self.threshold, regions=regions, direction='vertical',
|
||||
line_scale=self.line_scale, iterations=self.iterations)
|
||||
horizontal_mask, horizontal_segments = find_lines(
|
||||
self.threshold, regions=regions, direction='horizontal',
|
||||
line_scale=self.line_scale, iterations=self.iterations)
|
||||
|
||||
contours = find_contours(vertical_mask, horizontal_mask)
|
||||
table_bbox = find_joints(contours, vertical_mask, horizontal_mask)
|
||||
else:
|
||||
contours = find_table_contours(vertical_mask, horizontal_mask)
|
||||
table_bbox = find_table_joints(contours, vertical_mask, horizontal_mask)
|
||||
vertical_mask, vertical_segments = find_lines(
|
||||
self.threshold, direction='vertical', line_scale=self.line_scale,
|
||||
iterations=self.iterations)
|
||||
horizontal_mask, horizontal_segments = find_lines(
|
||||
self.threshold, direction='horizontal', line_scale=self.line_scale,
|
||||
iterations=self.iterations)
|
||||
|
||||
areas = scale_areas(self.table_areas)
|
||||
table_bbox = find_joints(areas, vertical_mask, horizontal_mask)
|
||||
|
||||
self.table_bbox_unscaled = copy.deepcopy(table_bbox)
|
||||
|
||||
|
|
|
|||
|
|
@ -26,6 +26,10 @@ class Stream(BaseParser):
|
|||
|
||||
Parameters
|
||||
----------
|
||||
table_regions : list, optional (default: None)
|
||||
List of page regions that may contain tables of the form x1,y1,x2,y2
|
||||
where (x1, y1) -> left-top and (x2, y2) -> right-bottom
|
||||
in PDF coordinate space.
|
||||
table_areas : list, optional (default: None)
|
||||
List of table area strings of the form x1,y1,x2,y2
|
||||
where (x1, y1) -> left-top and (x2, y2) -> right-bottom
|
||||
|
|
@ -51,9 +55,10 @@ class Stream(BaseParser):
|
|||
to generate columns.
|
||||
|
||||
"""
|
||||
def __init__(self, table_areas=None, columns=None, split_text=False,
|
||||
def __init__(self, table_regions=None, table_areas=None, columns=None, split_text=False,
|
||||
flag_size=False, strip_text='', edge_tol=50, row_tol=2,
|
||||
column_tol=0, **kwargs):
|
||||
self.table_regions = table_regions
|
||||
self.table_areas = table_areas
|
||||
self.columns = columns
|
||||
self._validate_columns()
|
||||
|
|
@ -275,7 +280,18 @@ class Stream(BaseParser):
|
|||
|
||||
def _generate_table_bbox(self):
|
||||
self.textedges = []
|
||||
if self.table_areas is not None:
|
||||
if self.table_areas is None:
|
||||
hor_text = self.horizontal_text
|
||||
if self.table_regions is not None:
|
||||
# filter horizontal text
|
||||
hor_text = []
|
||||
for region in self.table_regions:
|
||||
x1, y1, x2, y2 = region
|
||||
region_text = text_in_bbox((x1, y2, x2, y1), self.horizontal_text)
|
||||
hor_text.extend(region_text)
|
||||
# find tables based on nurminen's detection algorithm
|
||||
table_bbox = self._nurminen_table_detection(hor_text)
|
||||
else:
|
||||
table_bbox = {}
|
||||
for area in self.table_areas:
|
||||
x1, y1, x2, y2 = area.split(",")
|
||||
|
|
@ -284,9 +300,6 @@ class Stream(BaseParser):
|
|||
x2 = float(x2)
|
||||
y2 = float(y2)
|
||||
table_bbox[(x1, y2, x2, y1)] = None
|
||||
else:
|
||||
# find tables based on nurminen's detection algorithm
|
||||
table_bbox = self._nurminen_table_detection(self.horizontal_text)
|
||||
self.table_bbox = table_bbox
|
||||
|
||||
def _generate_columns_and_rows(self, table_idx, tk):
|
||||
|
|
|
|||
|
|
@ -101,7 +101,7 @@ stream_kwargs = [
|
|||
]
|
||||
lattice_kwargs = [
|
||||
'process_background',
|
||||
'line_size_scaling',
|
||||
'line_scale',
|
||||
'copy_text',
|
||||
'shift_text',
|
||||
'line_tol',
|
||||
|
|
@ -339,7 +339,7 @@ def text_in_bbox(bbox, text):
|
|||
----------
|
||||
bbox : tuple
|
||||
Tuple (x1, y1, x2, y2) representing a bounding box where
|
||||
(x1, y1) -> lb and (x2, y2) -> rt in PDFMiner coordinate
|
||||
(x1, y1) -> lb and (x2, y2) -> rt in the PDF coordinate
|
||||
space.
|
||||
text : List of PDFMiner text objects.
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,4 @@
|
|||
"Età dell’Assicuratoall’epoca del decesso","Misura % dimaggiorazione"
|
||||
"18-75","1,00%"
|
||||
"76-80","0,50%"
|
||||
"81 in poi","0,10%"
|
||||
|
Binary file not shown.
|
|
@ -206,12 +206,10 @@ You can also visualize the textedges found on a page by specifying ``kind='texte
|
|||
Specify table areas
|
||||
-------------------
|
||||
|
||||
In cases such as `these <../_static/pdf/table_areas.pdf>`__, it can be useful to specify table boundaries. You can plot the text on this page and note the top left and bottom right coordinates of the table.
|
||||
In cases such as `these <../_static/pdf/table_areas.pdf>`__, it can be useful to specify exact table boundaries. You can plot the text on this page and note the top left and bottom right coordinates of the table.
|
||||
|
||||
Table areas that you want Camelot to analyze can be passed as a list of comma-separated strings to :meth:`read_pdf() <camelot.read_pdf>`, using the ``table_areas`` keyword argument.
|
||||
|
||||
.. _for now: https://github.com/socialcopsdev/camelot/issues/102
|
||||
|
||||
::
|
||||
|
||||
>>> tables = camelot.read_pdf('table_areas.pdf', flavor='stream', table_areas=['316,499,566,337'])
|
||||
|
|
@ -226,6 +224,27 @@ Table areas that you want Camelot to analyze can be passed as a list of comma-se
|
|||
.. csv-table::
|
||||
:file: ../_static/csv/table_areas.csv
|
||||
|
||||
Specify table regions
|
||||
---------------------
|
||||
|
||||
However, there may be cases like `[1] <../_static/pdf/table_regions.pdf>`__ and `[2] <https://github.com/socialcopsdev/camelot/blob/master/tests/files/tableception.pdf>`__, where the table might not lie at the exact coordinates every time but in an approximate region.
|
||||
|
||||
You can use the ``table_regions`` keyword argument to :meth:`read_pdf() <camelot.read_pdf>` to solve for such cases. When ``table_regions`` is specified, Camelot will only analyze the specified regions to look for tables.
|
||||
|
||||
::
|
||||
|
||||
>>> tables = camelot.read_pdf('table_regions.pdf', table_regions=['170,370,560,270'])
|
||||
>>> tables[0].df
|
||||
|
||||
.. tip::
|
||||
Here's how you can do the same with the :ref:`command-line interface <cli>`.
|
||||
::
|
||||
|
||||
$ camelot lattice -R 170,370,560,270 table_regions.pdf
|
||||
|
||||
.. csv-table::
|
||||
:file: ../_static/csv/table_regions.csv
|
||||
|
||||
Specify column separators
|
||||
-------------------------
|
||||
|
||||
|
|
@ -434,11 +453,11 @@ You can pass ``row_tol=<+int>`` to group the rows closer together, as shown belo
|
|||
Detect short lines
|
||||
------------------
|
||||
|
||||
There might be cases while using :ref:`Lattice <lattice>` when smaller lines don't get detected. The size of the smallest line that gets detected is calculated by dividing the PDF page's dimensions with a scaling factor called ``line_size_scaling``. By default, its value is 15.
|
||||
There might be cases while using :ref:`Lattice <lattice>` when smaller lines don't get detected. The size of the smallest line that gets detected is calculated by dividing the PDF page's dimensions by a scaling factor called ``line_scale``. By default, its value is 15.
|
||||
|
||||
As you can guess, the larger the ``line_size_scaling``, the smaller the size of lines getting detected.
|
||||
As you can guess, the larger the ``line_scale``, the smaller the size of lines getting detected.
|
||||
|
||||
.. warning:: Making ``line_size_scaling`` very large (>150) will lead to text getting detected as lines.
|
||||
.. warning:: Making ``line_scale`` very large (>150) will lead to text getting detected as lines.
|
||||
|
||||
Here's a `PDF <../_static/pdf/short_lines.pdf>`__ where small lines separating the headers don't get detected with the default value of 15.
|
||||
|
||||
|
|
@ -458,11 +477,11 @@ Let's plot the table for this PDF.
|
|||
:alt: A plot of the PDF table with short lines
|
||||
:align: left
|
||||
|
||||
Clearly, the smaller lines separating the headers, couldn't be detected. Let's try with ``line_size_scaling=40``, and plot the table again.
|
||||
Clearly, the smaller lines separating the headers couldn't be detected. Let's try with ``line_scale=40``, and plot the table again.
|
||||
|
||||
::
|
||||
|
||||
>>> tables = camelot.read_pdf('short_lines.pdf', line_size_scaling=40)
|
||||
>>> tables = camelot.read_pdf('short_lines.pdf', line_scale=40)
|
||||
>>> camelot.plot(tables[0], kind='grid')
|
||||
>>> plt.show()
|
||||
|
||||
|
|
@ -511,7 +530,7 @@ We'll use the `PDF <../_static/pdf/short_lines.pdf>`__ from the previous example
|
|||
|
||||
::
|
||||
|
||||
>>> tables = camelot.read_pdf('short_lines.pdf', line_size_scaling=40, shift_text=[''])
|
||||
>>> tables = camelot.read_pdf('short_lines.pdf', line_scale=40, shift_text=[''])
|
||||
>>> tables[0].df
|
||||
|
||||
.. csv-table::
|
||||
|
|
@ -532,7 +551,7 @@ No surprises there — it did remain in place (observe the strings "2400" and "A
|
|||
|
||||
::
|
||||
|
||||
>>> tables = camelot.read_pdf('short_lines.pdf', line_size_scaling=40, shift_text=['r', 'b'])
|
||||
>>> tables = camelot.read_pdf('short_lines.pdf', line_scale=40, shift_text=['r', 'b'])
|
||||
>>> tables[0].df
|
||||
|
||||
.. tip::
|
||||
|
|
|
|||
|
|
@ -427,6 +427,13 @@ data_lattice_two_tables_2 = [
|
|||
["Pooled", "23889", "47.7", "1.5", "9.9", "19.9", "17.8", "3.3"]
|
||||
]
|
||||
|
||||
data_lattice_table_regions = [
|
||||
['Età dell’Assicurato \nall’epoca del decesso', 'Misura % di \nmaggiorazione'],
|
||||
['18-75', '1,00%'],
|
||||
['76-80', '0,50%'],
|
||||
['81 in poi', '0,10%']
|
||||
]
|
||||
|
||||
data_lattice_table_areas = [
|
||||
["", "", "", "", "", "", "", "", ""],
|
||||
["State", "n", "Literacy Status", "", "", "", "", "", ""],
|
||||
|
|
|
|||
Binary file not shown.
|
|
@ -159,6 +159,14 @@ def test_lattice_two_tables():
|
|||
assert df2.equals(tables[1].df)
|
||||
|
||||
|
||||
def test_lattice_table_regions():
|
||||
df = pd.DataFrame(data_lattice_table_regions)
|
||||
|
||||
filename = os.path.join(testdir, "table_region.pdf")
|
||||
tables = camelot.read_pdf(filename, table_regions=["170,370,560,270"])
|
||||
assert df.equals(tables[0].df)
|
||||
|
||||
|
||||
def test_lattice_table_areas():
|
||||
df = pd.DataFrame(data_lattice_table_areas)
|
||||
|
||||
|
|
@ -179,7 +187,7 @@ def test_lattice_copy_text():
|
|||
df = pd.DataFrame(data_lattice_copy_text)
|
||||
|
||||
filename = os.path.join(testdir, "row_span_1.pdf")
|
||||
tables = camelot.read_pdf(filename, line_size_scaling=60, copy_text="v")
|
||||
tables = camelot.read_pdf(filename, line_scale=60, copy_text="v")
|
||||
assert df.equals(tables[0].df)
|
||||
|
||||
|
||||
|
|
@ -189,13 +197,13 @@ def test_lattice_shift_text():
|
|||
df_rb = pd.DataFrame(data_lattice_shift_text_right_bottom)
|
||||
|
||||
filename = os.path.join(testdir, "column_span_2.pdf")
|
||||
tables = camelot.read_pdf(filename, line_size_scaling=40)
|
||||
tables = camelot.read_pdf(filename, line_scale=40)
|
||||
assert df_lt.equals(tables[0].df)
|
||||
|
||||
tables = camelot.read_pdf(filename, line_size_scaling=40, shift_text=[''])
|
||||
tables = camelot.read_pdf(filename, line_scale=40, shift_text=[''])
|
||||
assert df_disable.equals(tables[0].df)
|
||||
|
||||
tables = camelot.read_pdf(filename, line_size_scaling=40, shift_text=['r', 'b'])
|
||||
tables = camelot.read_pdf(filename, line_scale=40, shift_text=['r', 'b'])
|
||||
assert df_rb.equals(tables[0].df)
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue