From e8b93a9624e3cfa7eee960db80232d847bf68034 Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Wed, 12 Oct 2016 13:59:10 +0530 Subject: [PATCH] Add headers param --- camelot/lattice.py | 27 +++++++++++++++++++++++---- camelot/stream.py | 26 ++++++++++++++++++++++---- 2 files changed, 45 insertions(+), 8 deletions(-) diff --git a/camelot/lattice.py b/camelot/lattice.py index dc34b28..079e2a0 100644 --- a/camelot/lattice.py +++ b/camelot/lattice.py @@ -124,7 +124,7 @@ class Lattice: Parameters ---------- table_area : list - List of tuples of the form (x1, y1, x2, y2) where + List of strings of the form x1,y1,x2,y2 where (x1, y1) -> left-top and (x2, y2) -> right-bottom in PDFMiner's coordinate space, denoting table areas to analyze. (optional, default: None) @@ -135,6 +135,10 @@ class Lattice: or both directions. (optional, default: None) + headers : list + List of strings where each string is a csv header for a table. + (optional, default: None) + mtol : list List of ints specifying m-tolerance parameters. (optional, default: [2]) @@ -170,13 +174,14 @@ class Lattice: of detected contours, lines, joints and the table generated. (optional, default: None) """ - def __init__(self, table_area=None, fill=None, mtol=[2], scale=15, - invert=False, margins=(1.0, 0.5, 0.1), split_text=False, - shift_text=['l', 't'], debug=None): + def __init__(self, table_area=None, fill=None, headers=None, mtol=[2], + scale=15, invert=False, margins=(1.0, 0.5, 0.1), + split_text=False, shift_text=['l', 't'], debug=None): self.method = 'lattice' self.table_area = table_area self.fill = fill + self.headers = [h.split(',') for h in headers] self.mtol = mtol self.scale = scale self.invert = invert @@ -240,6 +245,10 @@ class Lattice: if self.fill is not None: if len(self.table_area) != len(self.fill): raise ValueError("Length of fill should be equal to table_area.") + if self.headers is not None: + if len(self.table_area) != len(self.headers): + raise ValueError("Length of headers should be equal to table_area.") + areas = [] for area in self.table_area: x1, y1, x2, y2 = area.split(",") @@ -297,6 +306,14 @@ class Lattice: for i in range(0, len(cols) - 1)] rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)] + + if self.headers is not None and len(self.headers[table_no]) != len(cols): + logging.warning("Length of header ({0}) specified for table is not" + " equal to the number of columns ({1}) detected.".format( + len(self.headers[table_no]), len(cols))) + while len(self.headers[table_no]) != len(cols): + self.headers[table_no].append('') + rows, cols = rotate_table(rows, cols, table_rotation) table = Table(cols, rows) # set table edges to True using ver+hor lines @@ -326,6 +343,8 @@ class Lattice: if self.fill is not None: table = _fill_spanning(table, fill=self.fill[table_no]) ar = table.get_list() + if self.headers is not None and self.headers[table_no] != ['']: + ar.insert(0, self.headers[table_no]) ar = encode_list(ar) table_data['data'] = ar empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar) diff --git a/camelot/stream.py b/camelot/stream.py index dfc74b6..04c3f06 100644 --- a/camelot/stream.py +++ b/camelot/stream.py @@ -224,7 +224,7 @@ class Stream: Parameters ---------- table_area : list - List of tuples of the form (x1, y1, x2, y2) where + List of strings of the form x1,y1,x2,y2 where (x1, y1) -> left-top and (x2, y2) -> right-bottom in PDFMiner's coordinate space, denoting table areas to analyze. (optional, default: None) @@ -238,6 +238,10 @@ class Stream: List of ints specifying the number of columns in each table. (optional, default: None) + headers : list + List of strings where each string is a csv header for a table. + (optional, default: None) + ytol : list List of ints specifying the y-tolerance parameters. (optional, default: [2]) @@ -260,9 +264,9 @@ class Stream: LTTextLineHorizontals in order to select table_area, columns. (optional, default: False) """ - def __init__(self, table_area=None, columns=None, ncolumns=None, ytol=[2], - mtol=[0], margins=(1.0, 0.5, 0.1), split_text=False, - debug=False): + def __init__(self, table_area=None, columns=None, ncolumns=None, + headers=None, ytol=[2], mtol=[0], margins=(1.0, 0.5, 0.1), + split_text=False, debug=False): self.method = 'stream' self.table_area = table_area @@ -270,6 +274,7 @@ class Stream: self.ncolumns = ncolumns self.ytol = ytol self.mtol = mtol + self.headers = [h.split(',') for h in headers] self.char_margin, self.line_margin, self.word_margin = margins self.split_text = split_text self.debug = debug @@ -310,6 +315,10 @@ class Stream: if self.ncolumns is not None: if len(self.table_area) != len(self.ncolumns): raise ValueError("Length of ncolumns should be equal to table_area.") + if self.headers is not None: + if len(self.table_area) != len(self.headers): + raise ValueError("Length of headers should be equal to table_area.") + table_bbox = {} for area in self.table_area: x1, y1, x2, y2 = area.split(",") @@ -399,6 +408,13 @@ class Stream: cols = _add_columns(cols, inner_text, self.ytol[table_no]) cols = _join_columns(cols, text_x_min, text_x_max) + if self.headers is not None and len(self.headers[table_no]) != len(cols): + logging.warning("Length of header ({0}) specified for table is not" + " equal to the number of columns ({1}) detected.".format( + len(self.headers[table_no]), len(cols))) + while len(self.headers[table_no]) != len(cols): + self.headers[table_no].append('') + table = Table(cols, rows) table = table.set_all_edges() assignment_errors = [] @@ -416,6 +432,8 @@ class Stream: table_data['score'] = score ar = table.get_list() + if self.headers is not None and self.headers[table_no] != ['']: + ar.insert(0, self.headers[table_no]) ar = encode_list(ar) table_data['data'] = ar empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar)