In [2]:
# Esempi di codice presi dal Capitolo 6 (Learning to Classify Text) del 
# manule di NLTK http://www.nltk.org/book/
# Questo notebook, apprende come classificare un nome proprio di persona 
# in base al sesso
import nltk
from nltk.corpus import names

In [3]:
def gender_features(word):
    return {'last_letter': word[-1]}

In [4]:
gender_features('Shrek')

{'last_letter': 'k'}

In [5]:
from nltk.corpus import names

In [6]:
labeled_names = ([(name, 'male') for name in names.words('male.txt')] +
                  [(name, 'female') for name in names.words('female.txt')])

In [7]:
import random
random.shuffle(labeled_names)

In [8]:
featuresets = [(gender_features(name), gender) for (name, gender) in labeled_names]

In [9]:
train_set = featuresets[500:]
test_set = featuresets[:500]

In [10]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [11]:
classifier.classify(gender_features('Neo'))

'male'

In [12]:
classifier.classify(gender_features('Trinity'))

'female'

In [13]:
print("la percentuale di accuratezza del classificatore bayesiano di NLTK: ", 
         nltk.classify.accuracy(classifier, test_set)*100, "%")

la percentuale di accuratezza del classificatore bayesiano di NLTK:  73.2 %


In [14]:
classifier.show_most_informative_features(26)

Most Informative Features
             last_letter = 'a'            female : male   =     36.8 : 1.0
             last_letter = 'k'              male : female =     32.3 : 1.0
             last_letter = 'f'              male : female =     15.3 : 1.0
             last_letter = 'p'              male : female =     11.9 : 1.0
             last_letter = 'v'              male : female =     10.6 : 1.0
             last_letter = 'd'              male : female =      9.6 : 1.0
             last_letter = 'o'              male : female =      8.7 : 1.0
             last_letter = 'm'              male : female =      8.3 : 1.0
             last_letter = 'r'              male : female =      6.6 : 1.0
             last_letter = 'w'              male : female =      5.4 : 1.0
             last_letter = 'g'              male : female =      4.9 : 1.0
             last_letter = 't'              male : female =      4.3 : 1.0
             last_letter = 's'              male : female =      4.2 : 1.0

In [15]:
classifier.classify(gender_features('Helen'))

'male'

In [None]:
# ATTENZIONE! ESAGERARE CON LA LISTA DELLE FEATURES NON SEMPRE HA DEI VANTAGGI

In [29]:
# ATTENZIONE! useremo di seguito il metodo "format" delle stringhe:
# "str.format() method"
# vediamo prima qualche esempio di come si usa questo metodo di formattazione

# using format option in a simple string 
print ("{}, A computer science portal for geeks."
                        .format("GeeksforGeeks")) 
  
# using format option for a 
# value stored in a variable 
str = "This article is written in {}"
print (str.format("Python")) 
  
# formatting a string using a numeric constant 
print ("Hello, I am {} years old !".format(18))

GeeksforGeeks, A computer science portal for geeks.
This article is written in Python
Hello, I am 18 years old !


In [16]:
def gender_features2(name):
    features = {}
    features["first_letter"] = name[0].lower()
    features["last_letter"] = name[-1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count({})".format(letter)] = name.lower().count(letter)
        features["has({})".format(letter)] = (letter in name.lower())
    return features

In [19]:
gender_features2('Jhon')

{'first_letter': 'j',
 'last_letter': 'n',
 'count(a)': 0,
 'has(a)': False,
 'count(b)': 0,
 'has(b)': False,
 'count(c)': 0,
 'has(c)': False,
 'count(d)': 0,
 'has(d)': False,
 'count(e)': 0,
 'has(e)': False,
 'count(f)': 0,
 'has(f)': False,
 'count(g)': 0,
 'has(g)': False,
 'count(h)': 1,
 'has(h)': True,
 'count(i)': 0,
 'has(i)': False,
 'count(j)': 1,
 'has(j)': True,
 'count(k)': 0,
 'has(k)': False,
 'count(l)': 0,
 'has(l)': False,
 'count(m)': 0,
 'has(m)': False,
 'count(n)': 1,
 'has(n)': True,
 'count(o)': 1,
 'has(o)': True,
 'count(p)': 0,
 'has(p)': False,
 'count(q)': 0,
 'has(q)': False,
 'count(r)': 0,
 'has(r)': False,
 'count(s)': 0,
 'has(s)': False,
 'count(t)': 0,
 'has(t)': False,
 'count(u)': 0,
 'has(u)': False,
 'count(v)': 0,
 'has(v)': False,
 'count(w)': 0,
 'has(w)': False,
 'count(x)': 0,
 'has(x)': False,
 'count(y)': 0,
 'has(y)': False,
 'count(z)': 0,
 'has(z)': False}

In [20]:
featuresets = [(gender_features2(n), gender) for (n, gender) in labeled_names]

In [30]:
train_set = featuresets[500:]

In [31]:
test_set = featuresets[:500]

In [32]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [33]:
print(nltk.classify.accuracy(classifier, test_set))

0.81


In [34]:
classifier.show_most_informative_features(50)

Most Informative Features
             last_letter = 'a'            female : male   =     36.8 : 1.0
             last_letter = 'k'              male : female =     32.3 : 1.0
             last_letter = 'f'              male : female =     15.3 : 1.0
             last_letter = 'p'              male : female =     11.9 : 1.0
             last_letter = 'v'              male : female =     10.6 : 1.0
             last_letter = 'd'              male : female =      9.6 : 1.0
             last_letter = 'o'              male : female =      8.7 : 1.0
                count(v) = 2              female : male   =      8.4 : 1.0
             last_letter = 'm'              male : female =      8.3 : 1.0
             last_letter = 'r'              male : female =      6.6 : 1.0
             last_letter = 'w'              male : female =      5.4 : 1.0
             last_letter = 'g'              male : female =      4.9 : 1.0
                count(a) = 3              female : male   =      4.6 : 1.0

In [26]:
classifier.classify(gender_features2('Helen'))

'male'

In [None]:
# PIUUTOSTO CHE UNA LUNGA LISTA DI FEATURES, MEGLIO UNA 
# SOLUZIONE CON LISTA DI DEVIAZIONE DELL'ERRORE

In [35]:
train_names = labeled_names[1500:]
devtest_names = labeled_names[500:1500]
test_names = labeled_names[:500]

In [36]:
train_set = [(gender_features(n), gender) for (n, gender) in train_names]
devtest_set = [(gender_features(n), gender) for (n, gender) in devtest_names]
test_set = [(gender_features(n), gender) for (n, gender) in test_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [38]:
# OTTENGO UNA ACCURATEZZA "INTERMEDIA" RISPETTO ALLE DUE SOLUZIONI PRECEDENTI
print(nltk.classify.accuracy(classifier, devtest_set))

0.777


In [39]:
# Using the dev-test set, we can generate a list of the errors 
# that the classifier makes when predicting name genders:

errors = []
for (name, tag) in devtest_names:
    guess = classifier.classify(gender_features(name))
    if guess != tag:
        errors.append( (tag, guess, name) )

In [46]:
for (tag, guess, name) in sorted(errors):
    print('correct={:<8} guess={:<8s} name={:<30}'.format(tag, guess, name))
# ATTEZNIONE! lo notazione "<8, <8s, <30" serve solo a creare spazio 
# nella formattazione delle colonne della tabella che viene stampata

correct=female   guess=male     name=Adel                          
correct=female   guess=male     name=Aidan                         
correct=female   guess=male     name=Allsun                        
correct=female   guess=male     name=Allys                         
correct=female   guess=male     name=Allyson                       
correct=female   guess=male     name=Amber                         
correct=female   guess=male     name=Angil                         
correct=female   guess=male     name=Annabell                      
correct=female   guess=male     name=Arabel                        
correct=female   guess=male     name=Ariel                         
correct=female   guess=male     name=Arlyn                         
correct=female   guess=male     name=Astrid                        
correct=female   guess=male     name=Averil                        
correct=female   guess=male     name=Bab                           
correct=female   guess=male     name=Brier      

In [47]:
# soluzione senza spazio tra le colonne 
for (tag, guess, name) in sorted(errors):
    print('correct={:} guess={:} name={:}'.format(tag, guess, name))

correct=female guess=male name=Adel
correct=female guess=male name=Aidan
correct=female guess=male name=Allsun
correct=female guess=male name=Allys
correct=female guess=male name=Allyson
correct=female guess=male name=Amber
correct=female guess=male name=Angil
correct=female guess=male name=Annabell
correct=female guess=male name=Arabel
correct=female guess=male name=Ariel
correct=female guess=male name=Arlyn
correct=female guess=male name=Astrid
correct=female guess=male name=Averil
correct=female guess=male name=Bab
correct=female guess=male name=Brier
correct=female guess=male name=Carin
correct=female guess=male name=Carlyn
correct=female guess=male name=Caro
correct=female guess=male name=Carolan
correct=female guess=male name=Carolann
correct=female guess=male name=Cass
correct=female guess=male name=Chantal
correct=female guess=male name=Charmion
correct=female guess=male name=Chris
correct=female guess=male name=Christen
correct=female guess=male name=Chrystal
correct=female gu

In [51]:
# quindi piuttosto che considerare l'ultima lettera o prendere in considerazione 
# tutte le lettere generando solo rumore, guardando la lista della divergenza 
# degli errori, puÃ² essere utile guardare alle due ultime lettere
# For example, names ending in 'yn' appear to be predominantly female, 
# despite the fact that names ending in n tend to be male; 
# moreover, names ending in 'ch' are usually male
def gender_features(word):
    return {'suffix1': word[-1:],
            'suffix2': word[-2:]}

In [52]:
train_set = [(gender_features(n), gender) for (n, gender) in train_names]
devtest_set = [(gender_features(n), gender) for (n, gender) in devtest_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, devtest_set))

0.793
