1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 """
23 Used to find difference between Data.
24
25 Copyright (C) 2004-2009 Tiny SPRL (<http://tiny.be>).
26 GNU General Public License.
27 """
28
29 from etl.component import component
30
class diff(component):
    """
    ETL component that finds the differences between two input flows.

    Records are matched across flows by a key tuple computed from the
    fields listed in ``keys`` (see ``key_get``), so the two flows do not
    have to deliver their records in the same order.

    Type                  : Data Component.
    Computing Performance : Semi-Streamline.
    Input Flows           : 2.
    * main                : The main flow.
    * .*                  : The second flow.
    Output Flows          : 0-x (channel names as yielded by ``process``).
    * same                : Elements that are identical in both input flows.
    * update              : Elements whose key is in both flows but whose
                            content differs.
    * remove              : Elements that were in main and not in the
                            second flow.
    * add                 : Elements from the second flow that are not in
                            the main flow.
    """

    def __init__(self, keys, name='component.process.diff', transformer=None, row_limit=0):
        """
        Required Parameters
        keys        : Field names whose values form the comparison key.

        Extra Parameters
        name        : Name of Component.
        transformer : Forwarded unchanged to the base component.
        row_limit   : Forwarded unchanged to the base component.
        """
        super(diff, self).__init__(name=name, transformer=transformer, row_limit=row_limit)
        self._type = 'component.transfer.diff'
        self.keys = keys
        # channel name -> {key tuple: record} for records not matched yet.
        self.row = {}
        self.diff = []
        self.same = []

    def __copy__(self):
        """Return a new ``diff`` built from this component's settings."""
        # The constructor stores the key list as ``self.keys``; the previous
        # ``self.key`` was never assigned anywhere in this class.
        res = diff(self.keys, self.name, self.transformer, self.row_limit)
        return res

    def key_get(self, row):
        """Return the tuple of ``row[k]`` for every ``k`` in ``self.keys``."""
        return tuple(row[k] for k in self.keys)

    def process(self):
        """
        Generate ``(record, channel)`` pairs describing the differences.

        Each input channel is consumed in turn.  A record whose key is
        already buffered under another channel is yielded as 'same' when
        the two records compare equal, otherwise as 'update', and the
        buffered record is dropped.  Any other record is buffered under
        its own channel.  Once every input is consumed, the records still
        buffered are yielded as 'remove' for the first buffered channel
        and 'add' for the second.
        """
        self.row = {}
        for channel, transition in self.input_get().items():
            if channel not in self.row:
                self.row[channel] = {}

            # Pick a previously seen channel to compare against, if any.
            other = None
            for name in self.row:
                if name != channel:
                    other = name
                    break

            for iterator in transition:
                for record in iterator:
                    key = self.key_get(record)
                    if other and (key in self.row[other]):
                        if self.row[other][key] == record:
                            yield record, 'same'
                        else:
                            yield record, 'update'
                        del self.row[other][key]
                    else:
                        # A later record with the same key on this channel
                        # overwrites the one buffered here.
                        self.row[channel][key] = record

        # NOTE(review): labels are handed out in the iteration order of
        # self.row ('remove' first, then 'add'), not by channel name, and a
        # third channel would make pop() raise IndexError - confirm that
        # input_get() always delivers 'main' first and at most two channels.
        todo = ['add', 'remove']
        for name in self.row:
            channel = todo.pop()
            for leftover in self.row[name].values():
                yield leftover, channel
101
def test():
    """
    Run the ``diff`` component, keyed on 'id', through the etl_test harness.

    Feeds ``input_part`` on the 'main' channel, declares ``modify`` as the
    output checked on 'main', and prints whatever the harness returns.
    """
    from etl_test import etl_test
    from etl import transformer  # NOTE(review): not used in this function

    # Plain ints replace the former long literals (1L); they compare and
    # hash equal on Python 2 and are the only spelling valid on Python 3.
    input_part = [
        {'id': 1, 'name': 'Fabien', 'address': 'france'},
        {'id': 1, 'name': 'Fabien', 'address': 'belgium'},
        {'id': 3, 'name': 'harshad', 'address': 'india'},
    ]

    # NOTE(review): 'add' and 'remove' are never passed to the harness;
    # presumably fixtures for channels that are not checked yet.
    add = [
        {'id': 4, 'name': 'henry', 'address': 'spain'}
    ]

    # An earlier three-row 'modify' list was assigned and then overwritten
    # by this one before any use, so only this definition is kept.
    modify = [
        {'id': 1, 'name': 'Fabien', 'address': 'india'}
    ]

    remove = [
        {'id': 1, 'name': 'Fabien', 'address': 'belgium'},
    ]

    harness = etl_test.etl_component_test(diff(['id']))
    harness.check_input({'main': input_part})
    harness.check_output(modify, 'main')
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(harness.output())
132
# Run the component's self-test only when this file is executed as a script.
if __name__ == '__main__':
    test()
135