Ingredient: Using anonymizers
This example shows a shelf with anonymizers. Anonymizers make a dataset look like it has random data by modifying the ingredient value.
[7]:
from examples_base import *
# Each ingredient is given a callable anonymizer that receives the value:
#   - state: the name is reversed (e.g. 'Alabama' -> 'amabalA')
#   - population: 1000 is added to the summed population
shelf = Shelf(
    {
        'state': Dimension(
            Census.state,
            anonymizer=lambda v: v[::-1],
        ),
        'population': Metric(
            func.sum(Census.pop2000),
            anonymizer=lambda v: v + 1000,
        ),
    }
)

# Anonymizers are only applied when the recipe is built with the Anonymize
# extension class AND the anonymize flag is switched on.
recipe = (
    Recipe(shelf=shelf, session=oven.Session(), extension_classes=[Anonymize])
    .dimensions('state')
    .metrics('population')
    .anonymize(True)
)

# Print the generated SQL, then show the result table as the cell's value.
print(recipe.to_sql())
recipe.dataset.df
SELECT census.state AS state_raw,
sum(census.pop2000) AS population_raw
FROM census
GROUP BY census.state
[7]:
| | state_raw | population_raw | state | state_id | population |
---|---|---|---|---|---|
0 | Alabama | 4438559 | amabalA | Alabama | 4439559 |
1 | Alaska | 608588 | aksalA | Alaska | 609588 |
2 | Arizona | 5143931 | anozirA | Arizona | 5144931 |
3 | Arkansas | 2672434 | sasnakrA | Arkansas | 2673434 |
4 | California | 33829442 | ainrofilaC | California | 33830442 |
5 | Colorado | 4300877 | odaroloC | Colorado | 4301877 |
6 | Connecticut | 3403620 | tucitcennoC | Connecticut | 3404620 |
7 | Delaware | 782386 | erawaleD | Delaware | 783386 |
8 | District of Columbia | 568103 | aibmuloC fo tcirtsiD | District of Columbia | 569103 |
9 | Florida | 15976093 | adirolF | Florida | 15977093 |
10 | Georgia | 8161776 | aigroeG | Georgia | 8162776 |
11 | Hawaii | 1167027 | iiawaH | Hawaii | 1168027 |
12 | Idaho | 1294560 | ohadI | Idaho | 1295560 |
13 | Illinois | 12405300 | sionillI | Illinois | 12406300 |
14 | Indiana | 6089161 | anaidnI | Indiana | 6090161 |
15 | Iowa | 2926878 | awoI | Iowa | 2927878 |
16 | Kansas | 2675929 | sasnaK | Kansas | 2676929 |
17 | Kentucky | 4028389 | ykcutneK | Kentucky | 4029389 |
18 | Louisiana | 4449489 | anaisiuoL | Louisiana | 4450489 |
19 | Maine | 1271694 | eniaM | Maine | 1272694 |
20 | Maryland | 5274039 | dnalyraM | Maryland | 5275039 |
21 | Massachusetts | 6357515 | sttesuhcassaM | Massachusetts | 6358515 |
22 | Michigan | 9951873 | nagihciM | Michigan | 9952873 |
23 | Minnesota | 4931897 | atosenniM | Minnesota | 4932897 |
24 | Mississippi | 2830133 | ippississiM | Mississippi | 2831133 |
25 | Missouri | 5588759 | iruossiM | Missouri | 5589759 |
26 | Montana | 899459 | anatnoM | Montana | 900459 |
27 | Nebraska | 1705040 | aksarbeN | Nebraska | 1706040 |
28 | Nevada | 2010272 | adaveN | Nevada | 2011272 |
29 | New Hampshire | 1239307 | erihspmaH weN | New Hampshire | 1240307 |
30 | New Jersey | 8420023 | yesreJ weN | New Jersey | 8421023 |
31 | New Mexico | 1809015 | ocixeM weN | New Mexico | 1810015 |
32 | New York | 18978668 | kroY weN | New York | 18979668 |
33 | North Carolina | 7978581 | aniloraC htroN | North Carolina | 7979581 |
34 | North Dakota | 633621 | atokaD htroN | North Dakota | 634621 |
35 | Ohio | 11355210 | oihO | Ohio | 11356210 |
36 | Oklahoma | 3430420 | amohalkO | Oklahoma | 3431420 |
37 | Oregon | 3428319 | nogerO | Oregon | 3429319 |
38 | Pennsylvania | 12276157 | ainavlysnneP | Pennsylvania | 12277157 |
39 | Rhode Island | 1047200 | dnalsI edohR | Rhode Island | 1048200 |
40 | South Carolina | 3983917 | aniloraC htuoS | South Carolina | 3984917 |
41 | South Dakota | 752231 | atokaD htuoS | South Dakota | 753231 |
42 | Tennessee | 5685230 | eessenneT | Tennessee | 5686230 |
43 | Texas | 20830810 | saxeT | Texas | 20831810 |
44 | Utah | 2238675 | hatU | Utah | 2239675 |
45 | Vermont | 609480 | tnomreV | Vermont | 610480 |
46 | Virginia | 6955790 | ainigriV | Virginia | 6956790 |
47 | Washington | 5863102 | notgnihsaW | Washington | 5864102 |
48 | West Virginia | 1805847 | ainigriV tseW | West Virginia | 1806847 |
49 | Wisconsin | 5372159 | nisnocsiW | Wisconsin | 5373159 |
50 | Wyoming | 490336 | gnimoyW | Wyoming | 491336 |
[8]:
from examples_base import *
# Here the state anonymizer is a format string rather than a callable;
# the population anonymizer still adds 1000 to the summed population.
# NOTE(review): the recorded run of this cell ends with
# KeyError: 'first_name', raised while the anonymizer string is being
# formatted -- confirm the '{fake:name}' syntax against the installed
# recipe version before relying on this example.
shelf = Shelf(
    {
        'state': Dimension(
            Census.state,
            anonymizer='{fake:name}',
        ),
        'population': Metric(
            func.sum(Census.pop2000),
            anonymizer=lambda v: v + 1000,
        ),
    }
)

# Anonymizers are only applied when the recipe is built with the Anonymize
# extension class AND the anonymize flag is switched on.
recipe = (
    Recipe(shelf=shelf, session=oven.Session(), extension_classes=[Anonymize])
    .dimensions('state')
    .metrics('population')
    .anonymize(True)
)

# Print the generated SQL, then show the result table as the cell's value.
print(recipe.to_sql())
recipe.dataset.df
SELECT census.state AS state_raw,
sum(census.pop2000) AS population_raw
FROM census
GROUP BY census.state
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-8-cc78427f7b0f> in <module>
15 # Look at the output.
16 print(recipe.to_sql())
---> 17 recipe.dataset.df
~/dev/py/recipe/recipe/core.py in dataset(self)
558 @property
559 def dataset(self):
--> 560 rows = self.all()
561 if rows:
562 first_row = rows[0]
~/dev/py/recipe/recipe/core.py in all(self)
531
532 self._all = self._cauldron.enchant(
--> 533 self._query.all(), cache_context=self.cache_context
534 )
535 enchanttime = time.time()
~/dev/py/recipe/recipe/shelf.py in enchant(self, list, cache_context)
654 # Iterate over the results and build a new namedtuple for each row
655 for row in list:
--> 656 values = row + tuple(fn(row) for fn in extra_callables)
657 enchantedlist.append(keyed_tuple(values))
658
~/dev/py/recipe/recipe/shelf.py in <genexpr>(.0)
654 # Iterate over the results and build a new namedtuple for each row
655 for row in list:
--> 656 values = row + tuple(fn(row) for fn in extra_callables)
657 enchantedlist.append(keyed_tuple(values))
658
~/dev/py/recipe/recipe/ingredients.py in <lambda>(row)
167 raw_property = self.id + '_raw'
168 yield self.id, lambda row: \
--> 169 self._format_value(getattr(row, raw_property))
170
171 def _order(self):
~/dev/py/recipe/recipe/ingredients.py in _format_value(self, value)
118 """Formats value using any stored formatters. """
119 for f in self.formatters:
--> 120 value = f(value)
121 return value
122
~/dev/py/recipe/recipe/utils.py in __call__(self, value)
265 def __call__(self, value):
266 self.fake.seed_instance(hash(value))
--> 267 value = self.formatter.format(self.format_str, fake=self.fake)
268 if self.postprocessor is None:
269 return value
/usr/local/Cellar/python/3.7.3/Frameworks/Python.framework/Versions/3.7/lib/python3.7/string.py in format(*args, **kwargs)
184 raise TypeError("format() missing 1 required positional "
185 "argument: 'format_string'") from None
--> 186 return self.vformat(format_string, args, kwargs)
187
188 def vformat(self, format_string, args, kwargs):
/usr/local/Cellar/python/3.7.3/Frameworks/Python.framework/Versions/3.7/lib/python3.7/string.py in vformat(self, format_string, args, kwargs)
188 def vformat(self, format_string, args, kwargs):
189 used_args = set()
--> 190 result, _ = self._vformat(format_string, args, kwargs, used_args, 2)
191 self.check_unused_args(used_args, args, kwargs)
192 return result
/usr/local/Cellar/python/3.7.3/Frameworks/Python.framework/Versions/3.7/lib/python3.7/string.py in _vformat(self, format_string, args, kwargs, used_args, recursion_depth, auto_arg_index)
228 # given the field_name, find the object it references
229 # and the argument it came from
--> 230 obj, arg_used = self.get_field(field_name, args, kwargs)
231 used_args.add(arg_used)
232
/usr/local/Cellar/python/3.7.3/Frameworks/Python.framework/Versions/3.7/lib/python3.7/string.py in get_field(self, field_name, args, kwargs)
293 first, rest = _string.formatter_field_name_split(field_name)
294
--> 295 obj = self.get_value(first, args, kwargs)
296
297 # loop through the rest of the field_name, doing
/usr/local/Cellar/python/3.7.3/Frameworks/Python.framework/Versions/3.7/lib/python3.7/string.py in get_value(self, key, args, kwargs)
250 return args[key]
251 else:
--> 252 return kwargs[key]
253
254
KeyError: 'first_name'
Using an anonymizer injects the anonymize function as the last formatter. The original value is still available as `<ingredient>_raw` (for example, `state_raw` and `population_raw` in the table above).