# Note: the block that starts at `dct = {}` ("dct" is short for dictionary) is the part I needed help with; I worked out the rest from there.
import nltk
import pickle
import pandas
# Small hand-written roster: one entry per person, column-oriented.
raw_data = {
    "first_name": ["Bob", "Teacher", "Moiraine", "Alastair", "Knap", "Matilda"],
    "last_name": ["Zoogs", "CD", "Damodred", "Damodred", "Plc", "Plc"],
    "gender": ["Male", "Male", "Female", "Male", "Male", "Female"],
}

# Fix the column order explicitly rather than relying on dict ordering.
df = pandas.DataFrame(
    raw_data,
    columns=["first_name", "last_name", "gender"],
)

# Surname -> sub-DataFrame holding only the rows for that surname.
last_name_dict = {
    surname: members for surname, members in df.groupby("last_name")
}
def gender_features(word):
    """Return the feature dict used to classify a first name.

    Args:
        word: The name to featurize (a string).

    Returns:
        A dict with two entries: ``'first_letter'`` and ``'last_letter'``,
        holding the first and last character of ``word``. For an empty
        string both values are ``''``.
    """
    # Slicing (rather than indexing) yields '' for an empty name instead of
    # raising IndexError; for non-empty strings the result is unchanged.
    return {
        'first_letter': word[:1],
        'last_letter': word[-1:],
    }
# Build, for every surname, the list of (feature-dict, gender) pairs that
# will serve as that family's training data.
dct = {}
for last in df.last_name.unique():
    family = last_name_dict[last]  # rows of df sharing this surname
    dct[last] = [
        (gender_features(first), gender)
        for first, gender in zip(family.first_name, family.gender)
    ]
# TRAIN in a loop: one Naive Bayes classifier per surname.
# traindct maps 'train_set_<surname>' -> that family's training pairs;
# class_dct maps 'Classif_<surname>' -> the classifier trained on them.
traindct = {}
class_dct = {}
for last in df.last_name.unique():
    train_key = 'train_set_%s' % last
    classif_key = 'Classif_%s' % last
    # [0:] takes a shallow copy, so traindct does not alias the lists in dct.
    traindct[train_key] = dct[last][0:]
    class_dct[classif_key] = nltk.NaiveBayesClassifier.train(traindct[train_key])
# CLASSIFY Test:
# The classifiers are stored in class_dct under 'Classif_<surname>' keys;
# no variables named Classif_Zoogs / Classif_Damodred are ever created, so
# they must be looked up in the dict.
print(class_dct['Classif_Zoogs'].classify(gender_features('Rob')))
# Output reported by the original author: Male
print(class_dct['Classif_Damodred'].classify(gender_features('Elaine')))
# Output reported by the original author: Female