"""
This is the code to accompany the Lesson 3 (decision tree) mini-project.

Use a Decision Tree to identify emails from the Enron corpus by author:
Sara has label 0
Chris has label 1
"""
import sys
import os
from time import time

# Location of the course's helper modules. The directory is added to the
# import path so that `email_preprocess` can be imported, and it is also made
# the working directory (presumably because preprocess() loads its data files
# through relative paths -- confirm against email_preprocess.py).
TOOLS_DIR = "C:\\Users\\PR043\\OneDrive for Business\\Training\\Datacamp\\Python\\Udacity\\Machine Learning\\ud120-projects\\tools"
sys.path.append(TOOLS_DIR)
os.chdir(TOOLS_DIR)
from email_preprocess import preprocess
from sklearn import tree

### features_train and features_test are the features for the training
### and testing datasets, respectively
### labels_train and labels_test are the corresponding item labels
features_train, features_test, labels_train, labels_test = preprocess()

#########################################################
### your code goes here ###
# min_samples_split=40: a node is only split further when it holds at least
# 40 training samples.
clf = tree.DecisionTreeClassifier(min_samples_split=40)
clf.fit(features_train, labels_train)
# score() reports the mean accuracy on the held-out test set.
print('accuracy of the decision tree classifiation is: %0.3f' % clf.score(features_test, labels_test))
#########################################################

# Number of features per sample, taken from the length of the first training
# row. Written as a parenthesised call so the line parses on both Python 2
# and Python 3 (the bare `print '...'` statement is a SyntaxError on 3).
print('number of features %d' % len(features_train[0]))
# Go into ../tools/email_preprocess.py and find the line of code that looks like this:
#     selector = SelectPercentile(f_classif, percentile=10)
# Change percentile from 10 to 1, and rerun dt_author_id.py. What's the number of features now?
import sys
import os
from time import time

# Location of the course's helper modules: appended to the import path so
# `email_preprocess` resolves, and used as the working directory (presumably
# because preprocess() reads its data files via relative paths -- confirm).
TOOLS_DIR = "C:\\Users\\PR043\\OneDrive for Business\\Training\\Datacamp\\Python\\Udacity\\Machine Learning\\ud120-projects\\tools"
sys.path.append(TOOLS_DIR)
os.chdir(TOOLS_DIR)
from email_preprocess import preprocess

# Load the train/test split of features and labels.
features_train, features_test, labels_train, labels_test = preprocess()

# Report how many features each sample has (length of the first training row).
# Parenthesised so the line is valid on both Python 2 and Python 3; the bare
# `print '...'` statement form is a SyntaxError on Python 3.
print('number of features %d' % len(features_train[0]))

from sklearn import tree

# Fit a decision tree that only splits nodes holding at least 40 samples,
# then print its mean accuracy on the test set.
clf = tree.DecisionTreeClassifier(min_samples_split=40)
clf.fit(features_train, labels_train)
print('accuracy of the decision tree classifiation is: %0.3f' % clf.score(features_test, labels_test))