diff --git a/camelot/lattice.py b/camelot/lattice.py index 079e2a0..57b4c12 100644 --- a/camelot/lattice.py +++ b/camelot/lattice.py @@ -162,6 +162,12 @@ class Lattice: different cells. (optional, default: False) + flag_size : bool + Whether or not to highlight a substring using <s></s> + if its size is different from rest of the string, useful for + super and subscripts. + (optional, default: True) + shift_text : list {'l', 'r', 't', 'b'} Select one or more from above and pass them as a list to @@ -176,17 +182,19 @@ class Lattice: """ def __init__(self, table_area=None, fill=None, headers=None, mtol=[2], scale=15, invert=False, margins=(1.0, 0.5, 0.1), - split_text=False, shift_text=['l', 't'], debug=None): + split_text=False, flag_size=True, shift_text=['l', 't'], + debug=None): self.method = 'lattice' self.table_area = table_area self.fill = fill - self.headers = [h.split(',') for h in headers] + self.headers = headers self.mtol = mtol self.scale = scale self.invert = invert self.char_margin, self.line_margin, self.word_margin = margins self.split_text = split_text + self.flag_size = flag_size self.shift_text = shift_text self.debug = debug @@ -248,6 +256,7 @@ class Lattice: if self.headers is not None: if len(self.table_area) != len(self.headers): raise ValueError("Length of headers should be equal to table_area.") + self.headers = [h.split(',') for h in headers] areas = [] for area in self.table_area: @@ -329,13 +338,20 @@ class Lattice: self.debug_tables.append(table) assignment_errors = [] + table_data['split_text'] = [] + table_data['superscript'] = [] for direction in t_bbox: for t in t_bbox[direction]: indices, error = get_table_index( - table, t, direction, split_text=self.split_text) + table, t, direction, split_text=self.split_text, + flag_size=self.flag_size) assignment_errors.append(error) - indices = _reduce_index(table, indices, shift_text=self.shift_text) + indices = _reduce_index(table, indices, shift_text=self.shift_text,) + if len(indices) > 1: + 
table_data['split_text'].append(indices) for r_idx, c_idx, text in indices: + if all(s in text for s in ['<s>', '</s>']): + table_data['superscript'].append((r_idx, c_idx, text)) table.cells[r_idx][c_idx].add_text(text) score = get_score([[100, assignment_errors]]) table_data['score'] = score diff --git a/camelot/stream.py b/camelot/stream.py index 04c3f06..d1ab29b 100644 --- a/camelot/stream.py +++ b/camelot/stream.py @@ -259,6 +259,12 @@ class Stream: different cells. (optional, default: False) + flag_size : bool + Whether or not to highlight a substring using <s></s> + if its size is different from rest of the string, useful for + super and subscripts. + (optional, default: True) + debug : bool Set to True to generate a matplotlib plot of LTTextLineHorizontals in order to select table_area, columns. @@ -266,7 +272,7 @@ class Stream: """ def __init__(self, table_area=None, columns=None, ncolumns=None, headers=None, ytol=[2], mtol=[0], margins=(1.0, 0.5, 0.1), - split_text=False, debug=False): + split_text=False, flag_size=True, debug=False): self.method = 'stream' self.table_area = table_area @@ -274,9 +280,10 @@ class Stream: self.ncolumns = ncolumns self.ytol = ytol self.mtol = mtol - self.headers = [h.split(',') for h in headers] + self.headers = headers self.char_margin, self.line_margin, self.word_margin = margins self.split_text = split_text + self.flag_size = flag_size self.debug = debug def get_tables(self, pdfname): @@ -318,6 +325,7 @@ class Stream: if self.headers is not None: if len(self.table_area) != len(self.headers): raise ValueError("Length of headers should be equal to table_area.") + self.headers = [h.split(',') for h in headers] table_bbox = {} for area in self.table_area: @@ -418,12 +426,19 @@ class Stream: table = Table(cols, rows) table = table.set_all_edges() assignment_errors = [] + table_data['split_text'] = [] + table_data['superscript'] = [] for direction in t_bbox: for t in t_bbox[direction]: indices, error = get_table_index( - table, t, direction, 
split_text=self.split_text) + table, t, direction, split_text=self.split_text, + flag_size=self.flag_size) assignment_errors.append(error) + if len(indices) > 1: + table_data['split_text'].append(indices) for r_idx, c_idx, text in indices: + if all(s in text for s in ['<s>', '</s>']): + table_data['superscript'].append((r_idx, c_idx, text)) table.cells[r_idx][c_idx].add_text(text) if guess: score = get_score([[66, assignment_errors], [34, [len_non_mode / len(elements)]]]) diff --git a/camelot/utils.py b/camelot/utils.py index e798dd8..91e19bc 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -1,6 +1,8 @@ from __future__ import division import os import logging +from itertools import groupby +from operator import itemgetter import numpy as np @@ -500,7 +502,49 @@ def merge_close_values(ar, mtol=2): return ret -def split_textline(table, textline, direction): +def flag_on_size(textline, direction): + """Flags a super/subscript by enclosing it with <s></s>. May give + false positives. + + Parameters + ---------- + textline : list + List of PDFMiner LTChar objects. + + direction : string + {'horizontal', 'vertical'} + Direction of the PDFMiner LTTextLine object. 
+ + Returns + ------- + fstring : string + """ + if direction == 'horizontal': + d = [(t.get_text(), np.round(t.height, decimals=6)) for t in textline if not isinstance(t, LTAnno)] + elif direction == 'vertical': + d = [(t.get_text(), np.round(t.width, decimals=6)) for t in textline if not isinstance(t, LTAnno)] + l = [np.round(size, decimals=6) for text, size in d] + if len(set(l)) > 1: + flist = [] + min_size = min(l) + for key, chars in groupby(d, itemgetter(1)): + if key == min_size: + fchars = [t[0] for t in chars] + if ''.join(fchars).strip(): + fchars.insert(0, '<s>') + fchars.append('</s>') + flist.append(''.join(fchars)) + else: + fchars = [t[0] for t in chars] + if ''.join(fchars).strip(): + flist.append(''.join(fchars)) + fstring = ''.join(flist).strip('\n') + else: + fstring = ''.join([t.get_text() for t in textline]).strip('\n') + return fstring + + +def split_textline(table, textline, direction, flag_size=True): """Splits PDFMiner LTTextLine into substrings if it spans across multiple rows/columns. @@ -516,9 +560,15 @@ def split_textline(table, textline, direction): {'horizontal', 'vertical'} Direction of the PDFMiner LTTextLine object. + flag_size : bool + Whether or not to highlight a substring using <s></s> + if its size is different from rest of the string, useful for + super and subscripts. + (optional, default: True) + Returns ------- - cut_text : list + grouped_chars : list List of tuples of the form (idx, text) where idx is the index of row/column and text is the an lttextline substring. 
""" @@ -538,10 +588,10 @@ def split_textline(table, textline, direction): if isinstance(obj, LTChar): if (row[1] <= (obj.y0 + obj.y1) / 2 <= row[0] and (obj.x0 + obj.x1) / 2 <= cut[1]): - cut_text.append((r, cut[0], obj.get_text().strip('\n'))) + cut_text.append((r, cut[0], obj)) break elif isinstance(obj, LTAnno): - cut_text.append((r, cut[0], obj.get_text().strip('\n'))) + cut_text.append((r, cut[0], obj)) elif direction == 'vertical' and not textline.is_empty(): y_overlap = [j for j, y in enumerate(table.rows) if y[1] <= bbox[3] and bbox[1] <= y[0]] c_idx = [i for i, c in enumerate(table.cols) if c[0] <= (bbox[0] + bbox[2]) / 2 <= c[1]] @@ -558,11 +608,18 @@ def split_textline(table, textline, direction): cut_text.append((cut[0], c, obj.get_text())) break elif isinstance(obj, LTAnno): - cut_text.append((cut[0], c, obj.get_text().strip('\n'))) - return cut_text + cut_text.append((cut[0], c, obj)) + grouped_chars = [] + for key, chars in groupby(cut_text, itemgetter(0, 1)): + if flag_size: + grouped_chars.append((key[0], key[1], flag_on_size([t[2] for t in chars], direction))) + else: + gchars = [t[2].get_text() for t in chars] + grouped_chars.append((key[0], key[1], ''.join(gchars).strip('\n'))) + return grouped_chars -def get_table_index(table, t, direction, split_text=False): +def get_table_index(table, t, direction, split_text=False, flag_size=True): """Gets indices of the cell where given text object lies by comparing their y and x-coordinates. @@ -583,6 +640,12 @@ def get_table_index(table, t, direction, split_text=False): multiple cells. (optional, default: False) + flag_size : bool + Whether or not to highlight a substring using <s></s> + if its size is different from rest of the string, useful for + super and subscripts. 
+ (optional, default: True) + Returns ------- indices : list @@ -632,9 +695,12 @@ def get_table_index(table, t, direction, split_text=False): error = ((X * (y0_offset + y1_offset)) + (Y * (x0_offset + x1_offset))) / charea if split_text: - return split_textline(table, t, direction), error + return split_textline(table, t, direction, flag_size=flag_size), error else: - return [(r_idx, c_idx, t.get_text().strip('\n'))], error + if flag_size: + return [(r_idx, c_idx, flag_on_size(t._objs, direction))], error + else: + return [(r_idx, c_idx, t.get_text().strip('\n'))], error def get_score(error_weights): diff --git a/tests/test_stream.py b/tests/test_stream.py index a23ad08..aea1dbd 100644 --- a/tests/test_stream.py +++ b/tests/test_stream.py @@ -85,7 +85,7 @@ def test_stream_missing_value(): ["4","","","",""] ] pdfname = os.path.join(testdir, "missing_values.pdf") - manager = Pdf(Stream(), pdfname, clean=True) + manager = Pdf(Stream(flag_size=False), pdfname, clean=True) tables = manager.extract() assert_equal(tables["page-1"]["table-1"]["data"], data) @@ -210,11 +210,11 @@ def test_stream_table_rotation(): ["","","","","","","","","54","","","","","","","","",""] ] pdfname = os.path.join(testdir, "left_rotated_table_2.pdf") - manager = Pdf(Stream(), pdfname, clean=True) + manager = Pdf(Stream(flag_size=False), pdfname, clean=True) tables = manager.extract() assert_equal(tables["page-1"]["table-1"]["data"], data) pdfname = os.path.join(testdir, "right_rotated_table_2.pdf") - manager = Pdf(Stream(), pdfname, clean=True) + manager = Pdf(Stream(flag_size=False), pdfname, clean=True) tables = manager.extract() assert_equal(tables["page-1"]["table-1"]["data"], data) \ No newline at end of file