import sys, string, math, copy, time ### GLOBALS ### theFile = "cs170_NN_big_test_53.txt" datapoints = [] class Point: def __init__(self, group, features = []): self.group = group #which class does the point belong to self.features = features #list to keep features in #function to compute the Euclidean distance between two points #arguments: two points, list of features on which to compute distance def distance(p,q, feat): d = 0 pFeat = p.features qFeat = q.features for i in feat: d += (pFeat[i]-qFeat[i])**2 return math.sqrt(d) def processData(): #Get data points comprising training set from external file global theFile, datapoints datafile = open(theFile,'r') rawdata = datafile.readlines() datafile.close() #parse through the input for point in rawdata: temp = string.split(point) datapoints.append(temp) #switch our data from strings into useable numeric form numFields = len(datapoints[0]) for i in range(len(datapoints)): if (len(datapoints[i]) == numFields): #correct number fields group = int(string.atof(datapoints[i][0])) #first field is class temp = Point(group) temp.features = [] for j in range(1,numFields): temp.features.append(string.atof(datapoints[i][j])) datapoints[i] = temp else: datapoints.pop(i) #invalid data so discard print "\nThis dataset has", numFields-1, "features and", \ len(datapoints), "instances.\n" print "Please wait while I normalize the data...", #normalize the data minData = copy.copy(datapoints[0].features) #get range for each feature maxData = copy.copy(datapoints[0].features) for i in range(len(datapoints)): for j in range(len(datapoints[i].features)): if datapoints[i].features[j] < minData[j]: minData[j] = datapoints[i].features[j] if datapoints[i].features[j] > maxData[j]: maxData[j] = datapoints[i].features[j] #use range data for each feature to normalize the data for i in range(len(datapoints)): for j in range(len(datapoints[i].features)): datapoints[i].features[j] = (datapoints[i].features[j]-minData[j])/ \ (maxData[j]-minData[j]) print "Done!" def findNearest(p, set, feat): nearest = 0 #index of nearest neighbor dist = 1000 #distance to nearest neighbor for i in range(len(set)): temp = distance(p, set[i], feat) #if (p.group==1): print set[i].group, temp if (temp < dist) and (temp != 0): nearest = i dist = temp return set[nearest] def findNearFast(p, set, feat, delta): #find a "pretty-good" nearest neighbor nearest = 0 #index of nearest neighbor dist = 1000 #distance to nearest neighbor for i in range(len(set)): temp = distance(p, set[i], feat) if (temp < delta) and (temp != 0): return set[i] if (temp < dist) and (temp != 0): nearest = i dist = temp return set[nearest] def testFeatureSet(set): global datapoints records = len(datapoints) correct = 0.0 for p in datapoints: neighbor = findNearest(p, datapoints, set) if neighbor.group == p.group: correct += 1 return 100*(correct/records) def testSetFast(set, delta): global datapoints records = len(datapoints) correct = 0.0 for p in datapoints: neighbor = findNearFast(p, datapoints, set, delta) if neighbor.group == p.group: correct += 1 return 100*(correct/records) def selectFeatures(choice): global datapoints records = len(datapoints) correct = 0.0 if (choice == 4): while 1: set = raw_input("Enter the list of features to use.\nUse space" + \ " or tab between numbers: ") set = string.split(set) for feat in range(len(set)): set[feat] = string.atoi(set[feat]) accuracy = testFeatureSet(set) print "Running nearest neighbor with feature set:", set, \ 'using "leave-one-out" evaluation, yields an accuracy of',\ accuracy, "%.\n" return feat = range(len(datapoints[0].features)) for p in datapoints: neighbor = findNearest(p, datapoints, feat) if neighbor.group == p.group: correct += 1 accuracy = 100*(correct/records) print "Running nearest neighbor with all", len(datapoints[0].features), \ 'features, using "leave-one-out" evaluation\nyields an accuracy of',\ accuracy, "%.\n" bestSet = [] bestAcc = 0.0 if (choice == 1): feat = [] print "Beginning Forward Selection Search." startTime = time.time() for i in range(len(datapoints[0].features)): max = 0 #store index of best feature tempAcc = 0.0 for j in range(len(datapoints[0].features)): if j not in feat: temp = feat + [j] accuracy = testFeatureSet(temp) print "\tUsing feature(s)", temp, "accuracy is", accuracy, "%" if (accuracy > tempAcc): max = j tempAcc = accuracy if tempAcc > bestAcc: bestSet = temp bestAcc = tempAcc feat.append(max) feat.sort() print "Feature set", feat, "was best with accuracy", tempAcc, "%" t = time.time() - startTime bestSet.sort() print "\nFinished search. The best subset is", bestSet, \ "which has an accuracy\nof", bestAcc, "%.\n" print "Search took", t, "seconds." elif (choice == 2): feat = range(len(datapoints[0].features)) print "Beginning Backward Elimination Search." startTime = time.time() for i in range(len(datapoints[0].features)): max = 0 #store index of feature to be removed tempAcc = 0.0 for j in range(len(datapoints[0].features)): if j in feat: temp = copy.copy(feat) temp.remove(j) accuracy = testFeatureSet(temp) print "\tUsing feature(s)", temp, "accuracy is", accuracy, "%" if (accuracy > tempAcc): max = j tempAcc = accuracy if tempAcc > bestAcc: bestSet = temp bestAcc = tempAcc feat.remove(max) print "Feature set", feat, "was best with accuracy", tempAcc, "%" t = time.time() - startTime bestSet.sort() print "\nFinished search. The best subset is", bestSet, \ "which has an accuracy\nof", bestAcc, "%.\n" print "Search took", t, "seconds." elif (choice == 3): feat = [] print '"Pretty Good" Search uses a delta value to find nearest\n' + \ 'neighbors fast. The larger the value, the faster the\n' + \ 'search goes, with less accuracy. Enter a value between 0 and 2.' delta = float(raw_input("delta: ")) if (delta < 0) or (delta > 2): print "Invalid entry. Goodbye.\n" sys.exit() startTime = time.time() print 'Beginning "Pretty Good" Search.' for i in range(len(datapoints[0].features)): max = 0 #store index of best feature tempAcc = 0.0 for j in range(len(datapoints[0].features)): if j not in feat: temp = feat + [j] accuracy = testSetFast(temp, delta) print "\tUsing feature(s)", temp if (accuracy > tempAcc): max = j tempAcc = accuracy if tempAcc > bestAcc: bestSet = temp bestAcc = tempAcc feat.append(max) feat.sort() print "Feature set", feat, "was best with accuracy", tempAcc, "%" bestSet.sort() accuracy = testFeatureSet(bestSet) t = time.time() - startTime print "\nFinished search. A pretty good subset is", bestSet, \ "which has an accuracy\nof", accuracy, "%.\n" print "Search took", t, "seconds." def main(): global theFile print "Welcome to Brandon Sterne's Feature Selection Algorithm." print "Using", theFile, "for test data.\n" print "\tEnter your choice of algorithm" print "\t 1. Forward Selection" print "\t 2. Backward Elimination" print "\t 3. Brandon's Special Algorithm" print "\t 4. Test a Feature Set" try: choice = int(raw_input("\t Choice: ")) except: print "Invalid choice. Goodbye." sys.exit() if choice < 1 or choice > 4: print "Invalid choice. Goodbye." sys.exit() #process data from file processData() #start feature selection process selectFeatures(choice) if __name__ == "__main__": main()