---
title: Title
keywords: fastai
sidebar: home_sidebar
---
import multiprocessing
import pandas as pd
import spacy
from spacy.lang.en import English
from spacy.tokenizer import Tokenizer
from tqdm.auto import tqdm
# Synthetic corpus: 50,000 rows of the same punctuation-heavy sentence,
# each made unique by prefixing its row index.
template = " I went to the park, yesterday, wasn't here after school? Today. --2"
text = [f"{n}{template}" for n in range(50_000)]

df = pd.DataFrame({'a': text})
df.head()
# Load the full English pipeline: tokenizer plus tagger, parser and NER.
nlp = spacy.load('en_core_web_sm')
nlp
# ~400 seconds
# Baseline: applying the whole pipeline per row is slow because every
# component runs, even though only the tokens are needed below.
x = df.a.apply(nlp)
# Reload with the statistical components disabled so only the tokenizer
# (and any remaining lightweight components) runs per call.
nlp = spacy.load('en_core_web_sm',
                 disable=['ner', 'tagger', 'parser'])
# ~9 seconds after disabling extra parsers.
df.a.apply(nlp)
def tokenize(x):
    """Run the module-level `nlp` pipeline on string *x* and return its token texts."""
    return [token.text for token in nlp(x)]

# ~9 seconds with text extraction (unchanged)
# Extracting .text adds no measurable cost on top of the pipeline call.
df.a.apply(tokenize)
# ~5 seconds with multiprocessing
# Five worker processes, each tokenizing a slice of the rows in parallel.
# NOTE(review): creating a Pool at module level without an
# `if __name__ == "__main__"` guard fails under the 'spawn' start method
# (default on Windows/macOS) — presumably this ran with 'fork'; confirm
# before reusing elsewhere.
with multiprocessing.Pool(5) as p:
    x = p.map(tokenize, df.a)
# ~5 seconds as long as chunksize is large
# imap yields results lazily; a large chunksize batches the 50k tiny tasks
# so per-task IPC overhead doesn't dominate.
with multiprocessing.Pool() as p:
    y = list(p.imap(tokenize, df.a, chunksize=1_000))
from concurrent import futures

# ~10 seconds
# Threads give essentially no speedup here (~10 s vs ~9 s single-threaded)
# since the tokenization work is CPU-bound Python-level code.
with futures.ThreadPoolExecutor(max_workers=4) as pool:
    res = list(pool.map(tokenize, df.a))
# ~12 seconds - Fluent Python book code: submit one future per row, then
# collect results as they finish, with a tqdm progress bar.
res = []
with futures.ThreadPoolExecutor(max_workers=4) as executor:
    # Map each future back to its input row (handy for error reporting;
    # the value itself is not read below).
    to_do_map = {}
    # Loop variable renamed from `text`: the original clobbered the
    # module-level `text` list defined at the top of the script.
    for row in df.a.values:
        future = executor.submit(tokenize, row)
        to_do_map[future] = row
    done_iter = futures.as_completed(to_do_map)
    done_iter = tqdm(done_iter, total=df.shape[0])
    for future in done_iter:
        # NOTE: as_completed yields in *completion* order, so `res` is not
        # guaranteed to align with df.a's row order (unlike executor.map).
        res.append(future.result())
# Compare the pipeline tokenizer against a bare rule-based Tokenizer built
# on a blank English vocab.
eng = English()
tok = Tokenizer(eng.vocab)

# Tokenize the first 100 rows with the pipeline-based tokenizer...
x1 = df.a.head(100).apply(tokenize)

def tokenize_tok(text):
    """Tokenize *text* with the bare `tok` tokenizer, returning token strings."""
    return [token.text for token in tok(text)]

# ...and with the bare tokenizer.
x2 = df.a.head(100).apply(tokenize_tok)

# Positions where the two tokenizations of the first row disagree...
[(a, b) for a, b in zip(x1[0], x2[0]) if a != b]
# ...and the full side-by-side pairing for inspection.
list(zip(x1[0], x2[0]))