Package etl :: Package component :: Package transform :: Module schema_validator
[hide private]
[frames | no frames]

Source Code for Module etl.component.transform.schema_validator

  1  # -*- encoding: utf-8 -*- 
  2  ############################################################################## 
  3  # 
  4  #    ETL system- Extract Transfer Load system 
  5  #    Copyright (C) 2004-2009 Tiny SPRL (<http://tiny.be>). All Rights Reserved 
  6  #    $Id$ 
  7  # 
  8  #    This program is free software: you can redistribute it and/or modify 
  9  #    it under the terms of the GNU General Public License as published by 
 10  #    the Free Software Foundation, either version 3 of the License, or 
 11  #    (at your option) any later version. 
 12  # 
 13  #    This program is distributed in the hope that it will be useful, 
 14  #    but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  #    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  #    GNU General Public License for more details. 
 17  # 
 18  #    You should have received a copy of the GNU General Public License 
 19  #    along with this program.  If not, see <http://www.gnu.org/licenses/>. 
 20  # 
 21  ############################################################################## 
 22  """ 
 23   To perform Schema Validation. 
 24   
 25   Copyright (C) 2004-2009 Tiny SPRL (<http://tiny.be>).  
 26   GNU General Public License. 
 27  """ 
 28  import types 
 29  from etl.component import component 
 30  import datetime 
 31  import time 
class schema_validator(component):
    """
    This is an ETL Component that performs Schema Validation.

    Type                  : Data Component.
    Computing Performance : Semi-Streamline.
    Input Flows           : 1.
    * .*                  : The main data flow with input data.
    Output Flows          : 0-x.
    * main                : Returns rows that passed every check.
    * invalid_field       : Returns data which has more or less fields.
    * invalid_name        : Returns data which has wrong field name.
    * invalid_key         : Returns data which does not match unique constraint.
    * invalid_null        : Returns data which does not match null constraint.
    * invalid_type        : Returns data that fields have invalid type.
    * invalid_size        : Returns data which has more size.
    * invalid_format      : Returns data which does not match with format.

    The schema maps each field name to a dict of optional rules read by
    ``process``: ``key``, ``Is_NULL``, ``type``, ``format`` and ``size``.
    """

    def __init__(self, schema, name='component.transform.schema_validator'):
        """
        Required Parameters
        schema  : Mapping of field name to a dict of validation rules.

        Extra Parameters
        name    : Name of Component.
        """
        super(schema_validator, self).__init__(name=name)
        # NOTE(review): 'transfer' here vs 'transform' in the default name;
        # left untouched because other code may match on this string.
        self._type = 'component.transfer.schema_validator'
        self.schema = schema

    def __copy__(self):
        """Return a new schema_validator sharing this schema and name."""
        return schema_validator(self.schema, self.name)

    def process(self):
        """
        Validate every incoming row and yield ``(row, channel)`` pairs.

        ``channel`` is 'main' for a row that passes every rule, otherwise the
        name of the first rule that failed (see the class docstring).
        Uniqueness of ``key`` fields is tracked separately for each incoming
        iterator and for each key field.

        Raises whatever the rule evaluation raises, e.g. TypeError when a
        ``size`` rule is applied to a value without a length or when a
        ``key`` value is unhashable, and any error from evaluating an
        unknown ``type`` name.
        """
        # Resolved ``type`` rules, keyed by the type-name string, so each
        # distinct name is evaluated once instead of once per row and field.
        type_cache = {}
        for _channel, trans in self.input_get().items():
            for iterator in trans:
                # field name -> set of key values already seen on this flow.
                seen_keys = {}
                for row in iterator:
                    if len(row) != len(self.schema):
                        result = 'invalid_field'
                    else:
                        result = self._check_row(row, seen_keys, type_cache)
                    yield row, result

    def _check_row(self, row, seen_keys, type_cache):
        """Return the output channel for a row with the expected field count."""
        for field in row:
            if field not in self.schema:
                return 'invalid_name'
            rules = self.schema[field]
            value = row[field]

            if rules.get('key', False):
                # Falsy key values (None, '', 0) are rejected outright.
                if not value:
                    return 'invalid_key'
                # Values are tracked per field so equal values in two
                # different key columns are not reported as duplicates.
                seen = seen_keys.setdefault(field, set())
                if value in seen:
                    return 'invalid_key'
                seen.add(value)

            # NOTE(review): a truthy 'Is_NULL' makes falsy values *invalid*,
            # i.e. the flag behaves like a NOT NULL constraint despite its
            # name. Confirm the intended meaning before changing it.
            if rules.get('Is_NULL', False):
                if not value:
                    return 'invalid_null'

            if rules.get('type', False):
                expected = self._resolve_type(rules['type'], type_cache)
                # Exact type match on purpose: subclasses (e.g. bool for int)
                # do not satisfy the rule.
                if type(value) is not expected:
                    return 'invalid_type'

            if rules.get('format', False):
                # TODO : improve this,
                # USE : check format using input mask validation or regular expression
                # .get() so that a 'format' rule without a 'type' rule is
                # skipped instead of raising KeyError.
                if rules.get('type') == "datetime.date":
                    try:
                        time.strptime(str(value), rules['format'])
                    except ValueError:
                        return 'invalid_format'

            if rules.get('size', False):
                if len(value) > int(rules['size']):
                    return 'invalid_size'
        return 'main'

    def _resolve_type(self, type_name, type_cache):
        """Return the object named by a ``type`` rule, memoised in type_cache."""
        try:
            return type_cache[type_name]
        except KeyError:
            # NOTE(review): eval() runs arbitrary expressions taken from the
            # schema; only safe while schemas come from trusted code.
            resolved = type_cache[type_name] = eval(type_name)
            return resolved


def test():
    """
    Smoke test: feed three sample rows through a schema_validator using the
    etl_test harness and print whatever the harness reports as output.
    """
    from etl_test import etl_test
    # Not referenced below; kept in case importing it has side effects the
    # harness relies on -- TODO confirm and drop if not.
    from etl import transformer
    # long(...) yields the same values as the L-suffixed literals.
    input_part = [
        {'id': long(1), 'name': 'Fabien', 'active': True, 'birth_date': '2009-02-01', 'amount': 209.58},
        {'id': long(2), 'name': 'Luc', 'active': True, 'birth_date': '2007-02-01', 'amount': 211.25},
        {'id': long(3), 'name': 'Henry', 'active': True, 'birth_date': '2006-02-01', 'amount': 219.20},
    ]
    # NOTE(review): birth_date values are str while the schema declares
    # datetime.date, so the validator's exact-type check rejects them before
    # the format rule is reached -- confirm this is the intended scenario.
    schema = {
        # 'Is_NULL' (not 'Is_Null') is the spelling the validator reads.
        'id': {'type': 'long', 'key': True, 'Is_NULL': True},
        'name': {'type': 'str', 'size': '10', 'Is_NULL': False},
        'active': {'type': 'bool', 'Is_NULL': False},
        # %Y is the four-digit year used by the sample dates (%y is two-digit).
        'birth_date': {'type': 'datetime.date', 'Is_NULL': False, 'format': '%Y-%m-%d'},
        'amount': {'type': 'float', 'Is_NULL': True},
    }
    tester = etl_test.etl_component_test(schema_validator(schema))
    tester.check_input({'main': input_part})
    print(tester.output())


if __name__ == '__main__':
    test()