---
title: Title
keywords: fastai
sidebar: home_sidebar
---
{% raw %}
import multiprocessing
import pandas as pd
import spacy
from spacy.lang.en import English
from spacy.tokenizer import Tokenizer
from tqdm.auto import tqdm
{% endraw %} {% raw %}
text = [
    f"{i} I went to the park, yesterday, wasn't here after school? Today. --2"
    for i in range(50_000)
]
{% endraw %} {% raw %}
df = pd.DataFrame(text, columns=['a'])
df.head()
                                                   a
0  0 I went to the park, yesterday, wasn't here a...
1  1 I went to the park, yesterday, wasn't here a...
2  2 I went to the park, yesterday, wasn't here a...
3  3 I went to the park, yesterday, wasn't here a...
4  4 I went to the park, yesterday, wasn't here a...
{% endraw %} {% raw %}
nlp = spacy.load('en_core_web_sm')
nlp
<spacy.lang.en.English at 0x12a251a58>
{% endraw %} {% raw %}
# ~400 seconds with the full pipeline (tagger, parser, NER).
x = df.a.apply(nlp)
{% endraw %} {% raw %}
# Keep only the tokenizer by disabling the pipeline components we don't need.
nlp = spacy.load('en_core_web_sm',
                 disable=['ner', 'tagger', 'parser'])
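{% endraw %}

With those three components disabled, only the tokenizer should run. A quick sanity check (a minimal sketch; with this spaCy 2.x model the list should come back empty, though newer models ship extra components):

{% raw %}
# Remaining enabled pipeline components -- expected to be an empty list here.
nlp.pipe_names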
{% endraw %} {% raw %}
# ~9 seconds after disabling the other pipeline components.
df.a.apply(nlp)
0        (0, I, went, to, the, park, ,, yesterday, ,, w...
1        (1, I, went, to, the, park, ,, yesterday, ,, w...
2        (2, I, went, to, the, park, ,, yesterday, ,, w...
3        (3, I, went, to, the, park, ,, yesterday, ,, w...
4        (4, I, went, to, the, park, ,, yesterday, ,, w...
                               ...                        
49995    (49995, I, went, to, the, park, ,, yesterday, ...
49996    (49996, I, went, to, the, park, ,, yesterday, ...
49997    (49997, I, went, to, the, park, ,, yesterday, ...
49998    (49998, I, went, to, the, park, ,, yesterday, ...
49999    (49999, I, went, to, the, park, ,, yesterday, ...
Name: a, Length: 50000, dtype: object
{% endraw %} {% raw %}
def tokenize(x):
    # Return plain token strings: lighter to store and to pass between
    # processes than full Doc objects.
    return [t.text for t in nlp(x)]
{% endraw %} {% raw %}
# Still ~9 seconds -- extracting the token texts adds no noticeable overhead.
df.a.apply(tokenize)
0        [0, I, went, to, the, park, ,, yesterday, ,, w...
1        [1, I, went, to, the, park, ,, yesterday, ,, w...
2        [2, I, went, to, the, park, ,, yesterday, ,, w...
3        [3, I, went, to, the, park, ,, yesterday, ,, w...
4        [4, I, went, to, the, park, ,, yesterday, ,, w...
                               ...                        
49995    [49995, I, went, to, the, park, ,, yesterday, ...
49996    [49996, I, went, to, the, park, ,, yesterday, ...
49997    [49997, I, went, to, the, park, ,, yesterday, ...
49998    [49998, I, went, to, the, park, ,, yesterday, ...
49999    [49999, I, went, to, the, park, ,, yesterday, ...
Name: a, Length: 50000, dtype: object
{% endraw %} {% raw %}
# ~5 seconds with multiprocessing
with multiprocessing.Pool(5) as p:
    x = p.map(tokenize, df.a)
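{% endraw %}

One caveat with multiprocessing: when the start method is spawn (Windows, and macOS on recent Python versions), workers re-import the calling module, so the pool should be created under a main guard and tokenize has to be importable at module level. A minimal sketch of the same call with the guard:

{% raw %}
# Same Pool.map call, wrapped in the guard required under the spawn start method.
if __name__ == "__main__":
    with multiprocessing.Pool(5) as p:
        x = p.map(tokenize, df.a)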
{% endraw %} {% raw %}
# ~5 seconds, as long as chunksize is large enough to amortize the
# per-item inter-process communication overhead.
with multiprocessing.Pool() as p:
    y = list(p.imap(tokenize, df.a, chunksize=1_000))
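{% endraw %}

spaCy also has its own batching API, nlp.pipe, which streams texts through the pipeline in batches. Not timed here, but worth comparing against the Pool versions (a minimal sketch; the batch_size value is arbitrary):

{% raw %}
# Stream the texts through the tokenizer-only pipeline in batches of 1,000.
docs = nlp.pipe(df.a, batch_size=1_000)
z = [[t.text for t in doc] for doc in docs]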
{% endraw %} {% raw %}
from concurrent import futures
{% endraw %} {% raw %}
# ~10 seconds -- threads barely help, most likely because tokenization is
# CPU-bound and serialized by the GIL.
with futures.ThreadPoolExecutor(max_workers=4) as ex:
    res = list(ex.map(tokenize, df.a))
{% endraw %} {% raw %}
# ~12 seconds - futures.as_completed pattern from the Fluent Python book.
# Note that results arrive in completion order, not input order.
res = []
with futures.ThreadPoolExecutor(max_workers=4) as executor:
    to_do_map = {}
    for text in df.a.values:
        future = executor.submit(tokenize, text)
        to_do_map[future] = text
    done_iter = futures.as_completed(to_do_map)
    done_iter = tqdm(done_iter, total=df.shape[0])
    for future in done_iter:
        res.append(future.result())

{% endraw %}

Trying to figure out the difference between nlp() and a separate Tokenizer object.

Followup: a Stack Overflow commenter claims the bare Tokenizer is just splitting on whitespace because we haven't defined any rules. But isn't that what the English() object is doing too? The comparison below checks this, with a short note at the end.

{% raw %}
eng = English()
tok = Tokenizer(eng.vocab)  # bare tokenizer: a vocab, but no prefix/suffix/infix rules
{% endraw %} {% raw %}
x1 = df.a.head(100).apply(tokenize)
{% endraw %} {% raw %}
def tokenize_tok(text):
    # Tokenize with the bare Tokenizer instead of the loaded model.
    return [t.text for t in tok(text)]
{% endraw %} {% raw %}
x2 = df.a.head(100).apply(tokenize_tok)
{% endraw %} {% raw %}
# Pairs where the two tokenizations differ: they drift out of alignment
# as soon as the model splits off the punctuation.
[(k1, k2) for k1, k2 in zip(x1[0], x2[0]) if k1 != k2]
[('park', 'park,'),
 (',', 'yesterday,'),
 ('yesterday', "wasn't"),
 (',', 'here'),
 ('was', 'after'),
 ("n't", 'school?'),
 ('here', 'Today.'),
 ('after', '--2')]
{% endraw %} {% raw %}
list(zip(x1[0], x2[0]))
[('0', '0'),
 ('I', 'I'),
 ('went', 'went'),
 ('to', 'to'),
 ('the', 'the'),
 ('park', 'park,'),
 (',', 'yesterday,'),
 ('yesterday', "wasn't"),
 (',', 'here'),
 ('was', 'after'),
 ("n't", 'school?'),
 ('here', 'Today.'),
 ('after', '--2')]
{% endraw %}
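
So the commenter is right about the bare tokenizer: a Tokenizer built from just a vocab, with no prefix/suffix/infix rules, only splits on whitespace, which is why 'park,' and "wasn't" survive as single tokens. English() itself is different: its default tokenizer ships with the English punctuation rules and contraction exceptions, so calling eng.tokenizer (or eng itself) should split this text the same way en_core_web_sm does. A minimal sketch to check that (output not shown here):

{% raw %}
# English()'s built-in tokenizer, unlike the bare Tokenizer(eng.vocab) above,
# carries the default punctuation rules and contraction exceptions.
x3 = df.a.head(100).apply(lambda s: [t.text for t in eng.tokenizer(s)])

# Expected to agree with the en_core_web_sm tokenization in x1 for this text.
(x1 == x3).all()
{% endraw %}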