Ingredient: Using anonymizers

This example shows a shelf with anonymizers. Anonymizers make a dataset look like it has random data by modifying the ingredient value.

[7]:
from examples_base import *

# Each ingredient carries its own anonymizer callable:
#   * state      -> the state name spelled backwards
#   * population -> the summed population plus 1000
state_dimension = Dimension(Census.state, anonymizer=lambda name: name[::-1])
population_metric = Metric(
    func.sum(Census.pop2000),
    anonymizer=lambda total: total + 1000,
)
shelf = Shelf({'state': state_dimension, 'population': population_metric})

# Anonymizers only take effect when the Anonymize extension class is
# passed to the recipe AND the anonymize flag is set to True.
recipe = (
    Recipe(shelf=shelf, session=oven.Session(), extension_classes=[Anonymize])
    .dimensions('state')
    .metrics('population')
    .anonymize(True)
)

# Print the generated SQL, then display the resulting dataframe.
print(recipe.to_sql())
recipe.dataset.df
SELECT census.state AS state_raw,
       sum(census.pop2000) AS population_raw
FROM census
GROUP BY census.state
[7]:
state_raw population_raw state state_id population
0 Alabama 4438559 amabalA Alabama 4439559
1 Alaska 608588 aksalA Alaska 609588
2 Arizona 5143931 anozirA Arizona 5144931
3 Arkansas 2672434 sasnakrA Arkansas 2673434
4 California 33829442 ainrofilaC California 33830442
5 Colorado 4300877 odaroloC Colorado 4301877
6 Connecticut 3403620 tucitcennoC Connecticut 3404620
7 Delaware 782386 erawaleD Delaware 783386
8 District of Columbia 568103 aibmuloC fo tcirtsiD District of Columbia 569103
9 Florida 15976093 adirolF Florida 15977093
10 Georgia 8161776 aigroeG Georgia 8162776
11 Hawaii 1167027 iiawaH Hawaii 1168027
12 Idaho 1294560 ohadI Idaho 1295560
13 Illinois 12405300 sionillI Illinois 12406300
14 Indiana 6089161 anaidnI Indiana 6090161
15 Iowa 2926878 awoI Iowa 2927878
16 Kansas 2675929 sasnaK Kansas 2676929
17 Kentucky 4028389 ykcutneK Kentucky 4029389
18 Louisiana 4449489 anaisiuoL Louisiana 4450489
19 Maine 1271694 eniaM Maine 1272694
20 Maryland 5274039 dnalyraM Maryland 5275039
21 Massachusetts 6357515 sttesuhcassaM Massachusetts 6358515
22 Michigan 9951873 nagihciM Michigan 9952873
23 Minnesota 4931897 atosenniM Minnesota 4932897
24 Mississippi 2830133 ippississiM Mississippi 2831133
25 Missouri 5588759 iruossiM Missouri 5589759
26 Montana 899459 anatnoM Montana 900459
27 Nebraska 1705040 aksarbeN Nebraska 1706040
28 Nevada 2010272 adaveN Nevada 2011272
29 New Hampshire 1239307 erihspmaH weN New Hampshire 1240307
30 New Jersey 8420023 yesreJ weN New Jersey 8421023
31 New Mexico 1809015 ocixeM weN New Mexico 1810015
32 New York 18978668 kroY weN New York 18979668
33 North Carolina 7978581 aniloraC htroN North Carolina 7979581
34 North Dakota 633621 atokaD htroN North Dakota 634621
35 Ohio 11355210 oihO Ohio 11356210
36 Oklahoma 3430420 amohalkO Oklahoma 3431420
37 Oregon 3428319 nogerO Oregon 3429319
38 Pennsylvania 12276157 ainavlysnneP Pennsylvania 12277157
39 Rhode Island 1047200 dnalsI edohR Rhode Island 1048200
40 South Carolina 3983917 aniloraC htuoS South Carolina 3984917
41 South Dakota 752231 atokaD htuoS South Dakota 753231
42 Tennessee 5685230 eessenneT Tennessee 5686230
43 Texas 20830810 saxeT Texas 20831810
44 Utah 2238675 hatU Utah 2239675
45 Vermont 609480 tnomreV Vermont 610480
46 Virginia 6955790 ainigriV Virginia 6956790
47 Washington 5863102 notgnihsaW Washington 5864102
48 West Virginia 1805847 ainigriV tseW West Virginia 1806847
49 Wisconsin 5372159 nisnocsiW Wisconsin 5373159
50 Wyoming 490336 gnimoyW Wyoming 491336
[8]:
from examples_base import *

# The state anonymizer is a format string ('{fake:name}'), not a lambda.
# The population anonymizer adds 1000
shelf = Shelf({
    'state': Dimension(Census.state, anonymizer='{fake:name}'),
    'population': Metric(func.sum(Census.pop2000), anonymizer=lambda v: v+1000)
})

# To use the anonymizer, the extension class Anonymize must be used.
# and the anonymize flag must be True
recipe = Recipe(shelf=shelf, session=oven.Session(), extension_classes=[Anonymize])\
    .dimensions('state').metrics('population').anonymize(True)

# Look at the output.
print(recipe.to_sql())
recipe.dataset.df
SELECT census.state AS state_raw,
       sum(census.pop2000) AS population_raw
FROM census
GROUP BY census.state
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-8-cc78427f7b0f> in <module>
     15 # Look at the output.
     16 print(recipe.to_sql())
---> 17 recipe.dataset.df

~/dev/py/recipe/recipe/core.py in dataset(self)
    558     @property
    559     def dataset(self):
--> 560         rows = self.all()
    561         if rows:
    562             first_row = rows[0]

~/dev/py/recipe/recipe/core.py in all(self)
    531
    532             self._all = self._cauldron.enchant(
--> 533                 self._query.all(), cache_context=self.cache_context
    534             )
    535             enchanttime = time.time()

~/dev/py/recipe/recipe/shelf.py in enchant(self, list, cache_context)
    654             # Iterate over the results and build a new namedtuple for each row
    655             for row in list:
--> 656                 values = row + tuple(fn(row) for fn in extra_callables)
    657                 enchantedlist.append(keyed_tuple(values))
    658

~/dev/py/recipe/recipe/shelf.py in <genexpr>(.0)
    654             # Iterate over the results and build a new namedtuple for each row
    655             for row in list:
--> 656                 values = row + tuple(fn(row) for fn in extra_callables)
    657                 enchantedlist.append(keyed_tuple(values))
    658

~/dev/py/recipe/recipe/ingredients.py in <lambda>(row)
    167             raw_property = self.id + '_raw'
    168             yield self.id, lambda row: \
--> 169                 self._format_value(getattr(row, raw_property))
    170
    171     def _order(self):

~/dev/py/recipe/recipe/ingredients.py in _format_value(self, value)
    118         """Formats value using any stored formatters. """
    119         for f in self.formatters:
--> 120             value = f(value)
    121         return value
    122

~/dev/py/recipe/recipe/utils.py in __call__(self, value)
    265     def __call__(self, value):
    266         self.fake.seed_instance(hash(value))
--> 267         value = self.formatter.format(self.format_str, fake=self.fake)
    268         if self.postprocessor is None:
    269             return value

/usr/local/Cellar/python/3.7.3/Frameworks/Python.framework/Versions/3.7/lib/python3.7/string.py in format(*args, **kwargs)
    184             raise TypeError("format() missing 1 required positional "
    185                             "argument: 'format_string'") from None
--> 186         return self.vformat(format_string, args, kwargs)
    187
    188     def vformat(self, format_string, args, kwargs):

/usr/local/Cellar/python/3.7.3/Frameworks/Python.framework/Versions/3.7/lib/python3.7/string.py in vformat(self, format_string, args, kwargs)
    188     def vformat(self, format_string, args, kwargs):
    189         used_args = set()
--> 190         result, _ = self._vformat(format_string, args, kwargs, used_args, 2)
    191         self.check_unused_args(used_args, args, kwargs)
    192         return result

/usr/local/Cellar/python/3.7.3/Frameworks/Python.framework/Versions/3.7/lib/python3.7/string.py in _vformat(self, format_string, args, kwargs, used_args, recursion_depth, auto_arg_index)
    228                 # given the field_name, find the object it references
    229                 #  and the argument it came from
--> 230                 obj, arg_used = self.get_field(field_name, args, kwargs)
    231                 used_args.add(arg_used)
    232

/usr/local/Cellar/python/3.7.3/Frameworks/Python.framework/Versions/3.7/lib/python3.7/string.py in get_field(self, field_name, args, kwargs)
    293         first, rest = _string.formatter_field_name_split(field_name)
    294
--> 295         obj = self.get_value(first, args, kwargs)
    296
    297         # loop through the rest of the field_name, doing

/usr/local/Cellar/python/3.7.3/Frameworks/Python.framework/Versions/3.7/lib/python3.7/string.py in get_value(self, key, args, kwargs)
    250             return args[key]
    251         else:
--> 252             return kwargs[key]
    253
    254

KeyError: 'first_name'

Using an anonymizer injects the anonymize function as the last formatter. The original, un-anonymized value remains available as <ingredient>_raw (for example, state_raw and population_raw in the output above).