Package etl :: Package component :: Package transform :: Module diff
[hide private]
[frames | no frames]

Source Code for Module etl.component.transform.diff

  1  # -*- encoding: utf-8 -*- 
  2  ############################################################################## 
  3  # 
  4  #    ETL system- Extract Transfer Load system 
  5  #    Copyright (C) 2004-2009 Tiny SPRL (<http://tiny.be>). All Rights Reserved 
  6  #    $Id$ 
  7  # 
  8  #    This program is free software: you can redistribute it and/or modify 
  9  #    it under the terms of the GNU General Public License as published by 
 10  #    the Free Software Foundation, either version 3 of the License, or 
 11  #    (at your option) any later version. 
 12  # 
 13  #    This program is distributed in the hope that it will be useful, 
 14  #    but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  #    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  #    GNU General Public License for more details. 
 17  # 
 18  #    You should have received a copy of the GNU General Public License 
 19  #    along with this program.  If not, see <http://www.gnu.org/licenses/>. 
 20  # 
 21  ############################################################################## 
 22  """ 
 23   Finds the differences between two data flows. 
 24   
 25   Copyright (C) 2004-2009 Tiny SPRL (<http://tiny.be>). 
 26   GNU General Public License. 
 27  """ 
 28   
 29  from etl.component import component 
 30   
class diff(component):
    """
        This is an ETL Component that finds differences.
        Takes 2 flows in input and detects the differences between these two
        flows, using computed keys (based on data records) to compare elements
        that may not be in the same order.

        Type                  : Data Component.
        Computing Performance : Semi-Streamline.
        Input Flows           : 2.
        * main                : The main flow.
        * .*                  : The second flow.
        Output Flows          : 0-x.
        * same                : Rows whose key and content are identical in both flows.
        * update              : Rows whose key is in both flows but whose content differs.
        * remove              : Rows that were in main and not in the second flow.
        * add                 : Rows from the second flow that are not in main.
    """

    def __init__(self, keys, name='component.process.diff', transformer=None, row_limit=0):
        """
        Required Parameters
        keys        : Field names whose values form the comparison key of a row.

        Extra Parameters
        name        : Name of Component.
        transformer : Passed through unchanged to the base component.
        row_limit   : Passed through unchanged to the base component.
        """
        super(diff, self).__init__(name=name, transformer=transformer, row_limit=row_limit)
        self._type = 'component.transfer.diff'
        self.keys = keys
        # Per-channel store of rows not matched yet: {channel: {key: row}}.
        self.row = {}
        # Initialised here but not used by any method of this class.
        self.diff = []
        self.same = []

    def __copy__(self):
        """Return a new diff component built from the same constructor arguments."""
        # The attribute set in __init__ is `keys`; reading `self.key` raised
        # AttributeError and made the component impossible to copy.
        res = diff(self.keys, self.name, self.transformer, self.row_limit)
        return res

    def key_get(self, row):
        """Return the comparison key of `row`: a tuple of its values for self.keys."""
        return tuple(row[k] for k in self.keys)

    def process(self):
        """
        Compare the rows of the input channels and yield (row, channel) pairs.

        Rows whose key was already seen on the other channel are yielded
        immediately as 'same' or 'update' (the row yielded is the one read
        last). Rows left unmatched once every input is consumed are yielded
        as 'remove' for the reference channel and 'add' for any other channel.
        """
        self.row = {}
        # Channels in the order they were processed; used to label leftovers
        # deterministically instead of relying on dict iteration order.
        seen = []
        for channel, transition in self.input_get().items():
            if channel not in self.row:
                self.row[channel] = {}
                seen.append(channel)
            # Pick a channel other than the current one to compare against.
            other = None
            for name in self.row:
                if name != channel:
                    other = name
                    break
            for iterator in transition:
                for r in iterator:
                    key = self.key_get(r)
                    if other is not None and key in self.row[other]:
                        if self.row[other][key] == r:
                            yield r, 'same'
                        else:
                            yield r, 'update'
                        # Matched: the stored row is no longer a leftover.
                        del self.row[other][key]
                    else:
                        self.row[channel][key] = r
        # The reference flow is 'main' when present, otherwise the first
        # channel processed. Its leftovers were removed; any other channel's
        # leftovers were added.
        if 'main' in self.row:
            reference = 'main'
        elif seen:
            reference = seen[0]
        else:
            reference = None
        for name in seen:
            if name == reference:
                label = 'remove'
            else:
                label = 'add'
            for v in self.row[name].values():
                yield v, label
101
def test():
    """
    Manual smoke test: feed sample rows to a diff component keyed on 'id'
    through the etl_test harness and print the harness output.
    """
    from etl_test import etl_test
    # Unused in this function; kept in case importing it has side effects
    # the harness relies on - TODO confirm and drop if not.
    from etl import transformer
    # Plain int literals: the Python-2-only `L` suffix is a syntax error on
    # Python 3 and the values compare equal either way.
    input_part = [
        {'id': 1, 'name': 'Fabien', 'address': 'france'},
        {'id': 1, 'name': 'Fabien', 'address': 'belgium'},
        {'id': 3, 'name': 'harshad', 'address': 'india'},
    ]

    # Sample rows for the 'add' and 'remove' outputs; not checked below.
    add = [
        {'id': 4, 'name': 'henry', 'address': 'spain'},
    ]

    # An earlier three-row `modify` list was overwritten by this one before
    # being used, so only the effective value is kept.
    modify = [
        {'id': 1, 'name': 'Fabien', 'address': 'india'},
    ]

    remove = [
        {'id': 1, 'name': 'Fabien', 'address': 'belgium'},
    ]
    component_test = etl_test.etl_component_test(diff(['id']))
    component_test.check_input({'main': input_part})
    component_test.check_output(modify, 'main')
    # Parenthesised single argument prints identically on Python 2 and 3.
    print(component_test.output())
# Run the smoke test only when this module is executed as a script.
if __name__ == '__main__':
    test()