Generating Word Vectors Using FastText
In this post, we convert the words in the dataset's vocabulary into dense vectors using FastText.
import os
import re
import string
import fasttext
import pandas as pd
from spacy.lang.en import English
nlp = English()  # blank English pipeline, used here for tokenisation and stop-word lookup
The dataset can be downloaded either from the terminal:
gsutil cp gs://dataset-uploader/bbc/bbc-text.csv .
or by visiting this website in the browser.
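If gsutil is not available, the same file can usually be fetched over HTTPS; a minimal sketch, assuming the bucket is publicly readable (public GCS objects are served at https://storage.googleapis.com/<bucket>/<path>, inferred from the gs:// path above):
pd.read_csv("https://storage.googleapis.com/dataset-uploader/bbc/bbc-text.csv")  # read the CSV straight from the public URL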
df = pd.read_csv("bbc-text.csv")  # reading the dataset into a DataFrame
df
df.drop(columns=["category"], inplace=True)  # the category labels are not needed for unsupervised training
df
refined_string_list = []
for query in df["text"]:
    # removing punctuation from the string
    string_translate = query.translate(str.maketrans("", "", string.punctuation))
    # running the string through spaCy's English pipeline to tokenise it
    spacy_doc = nlp(string_translate)
    # collecting the tokens of the current document
    token_list = [token.text for token in spacy_doc]
    # keeping only the tokens that are not in spaCy's stop-word list
    filtered_list = [word for word in token_list if not nlp.vocab[word].is_stop]
    # joining the remaining tokens back into a sentence
    filtered_sentence = " ".join(filtered_list)
    # collapsing runs of spaces into a single space
    filtered_sentence = re.sub(" +", " ", filtered_sentence)
    # appending the cleaned, stop-word-free string to the result list
    refined_string_list.append(filtered_sentence)
refined_string_list[0]  # inspecting the first cleaned document
with open("refined-bbc-text.txt", "w") as f:
for item in refined_string_list:
f.write("%s\n" % item)
# <-- TRAINING THE FASTTEXT MODEL -->
print(fasttext.train_unsupervised.__doc__)
Default values for the parameters of fasttext.train_unsupervised() are shown in [ ] (a sketch of overriding them follows the list):
input # training file path (required)
model # unsupervised fasttext model {cbow, skipgram} [skipgram]
lr # learning rate [0.05]
dim # size of word vectors [100]
ws # size of the context window [5]
epoch # number of epochs [5]
minCount # minimal number of word occurrences [5]
minn # min length of char ngram [3]
maxn # max length of char ngram [6]
neg # number of negatives sampled [5]
wordNgrams # max length of word ngram [1]
loss # loss function {ns, hs, softmax, ova} [ns]
bucket # number of buckets [2000000]
lrUpdateRate # change the rate of updates for the learning rate [100]
t # sampling threshold [0.0001]
verbose # verbose [2]
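Any of these defaults can be overridden as keyword arguments; a minimal sketch, with illustrative (not recommended) values:
model_cbow = fasttext.train_unsupervised(
    "refined-bbc-text.txt",  # the training file written above
    model="cbow",            # CBOW instead of the default skipgram
    ws=10,                   # widen the context window from 5 to 10
    minCount=2,              # keep words that occur at least twice
    epoch=10,                # more passes over this small corpus
)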
%%time
model = fasttext.train_unsupervised("refined-bbc-text.txt", dim=300, thread=4)  # skipgram (the default) with 300-dimensional vectors, using 4 threads
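A quick sanity check after training; model.words and model.get_dimension() are part of the fasttext Python API:
len(model.words)       # size of the learned vocabulary
model.get_dimension()  # 300, matching dim above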
with open("tensorboard/metadata.tsv", "w") as f:
for item in model.words:
f.write(
"%s\n" % item
) # writing the vocabulary words of the model to a text file
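The metadata file alone is not enough for the standalone Embedding Projector (projector.tensorflow.org), which also expects a tab-separated file of the vectors themselves; a minimal sketch of writing it alongside the metadata:
with open("tensorboard/vectors.tsv", "w") as f:
    for word in model.words:
        vector = model.get_word_vector(word)  # the 300-dimensional vector for this word
        f.write("\t".join(str(value) for value in vector) + "\n")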
model.save_model("fasttextmodel.bin") # saving the model
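The saved model can later be reloaded and queried; load_model, get_word_vector and get_nearest_neighbors are all part of the fasttext Python API (the query word "economy" is just an illustrative choice for a BBC-news vocabulary):
model = fasttext.load_model("fasttextmodel.bin")
model.get_word_vector("economy")        # the 300-dimensional vector for "economy"
model.get_nearest_neighbors("economy")  # vocabulary words with the most similar vectors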