Decision Tree

H
p
I
"""
Implementation of a basic regression decision tree.
Input data set: The input data set must be 1-dimensional with continuous labels.
Output: The decision tree maps a real number input to a real number output.
"""

import numpy as np


class DecisionTree:
    def __init__(self, depth=5, min_leaf_size=5):
        self.depth = depth
        self.decision_boundary = 0
        self.left = None
        self.right = None
        self.min_leaf_size = min_leaf_size
        self.prediction = None

    def mean_squared_error(self, labels, prediction):
        """
        mean_squared_error:
        @param labels: a one-dimensional numpy array
        @param prediction: a floating point value
        return value: mean_squared_error calculates the error if prediction is used to
            estimate the labels
        >>> tester = DecisionTree()
        >>> test_labels = np.array([1,2,3,4,5,6,7,8,9,10])
        >>> test_prediction = float(6)
        >>> bool(tester.mean_squared_error(test_labels, test_prediction) == (
        ...     TestDecisionTree.helper_mean_squared_error_test(test_labels,
        ...         test_prediction)))
        True
        >>> test_labels = np.array([1,2,3])
        >>> test_prediction = float(2)
        >>> bool(tester.mean_squared_error(test_labels, test_prediction) == (
        ...     TestDecisionTree.helper_mean_squared_error_test(test_labels,
        ...         test_prediction)))
        True
        """
        if labels.ndim != 1:
            print("Error: Input labels must be one dimensional")

        return np.mean((labels - prediction) ** 2)

    def train(self, x, y):
        """
        train:
        @param x: a one-dimensional numpy array
        @param y: a one-dimensional numpy array.
        The contents of y are the labels for the corresponding X values

        train() does not have a return value

        Examples:
        1. Try to train when x & y are of same length & 1 dimensions (No errors)
        >>> dt = DecisionTree()
        >>> dt.train(np.array([10,20,30,40,50]),np.array([0,0,0,1,1]))

        2. Try to train when x is 2 dimensions
        >>> dt = DecisionTree()
        >>> dt.train(np.array([[1,2,3,4,5],[1,2,3,4,5]]),np.array([0,0,0,1,1]))
        Traceback (most recent call last):
            ...
        ValueError: Input data set must be one-dimensional

        3. Try to train when x and y are not of the same length
        >>> dt = DecisionTree()
        >>> dt.train(np.array([1,2,3,4,5]),np.array([[0,0,0,1,1],[0,0,0,1,1]]))
        Traceback (most recent call last):
            ...
        ValueError: x and y have different lengths

        4. Try to train when x & y are of the same length but different dimensions
        >>> dt = DecisionTree()
        >>> dt.train(np.array([1,2,3,4,5]),np.array([[1],[2],[3],[4],[5]]))
        Traceback (most recent call last):
            ...
        ValueError: Data set labels must be one-dimensional

        This section is to check that the inputs conform to our dimensionality
        constraints
        """
        if x.ndim != 1:
            raise ValueError("Input data set must be one-dimensional")
        if len(x) != len(y):
            raise ValueError("x and y have different lengths")
        if y.ndim != 1:
            raise ValueError("Data set labels must be one-dimensional")

        if len(x) < 2 * self.min_leaf_size:
            self.prediction = np.mean(y)
            return

        if self.depth == 1:
            self.prediction = np.mean(y)
            return

        best_split = 0
        min_error = self.mean_squared_error(x, np.mean(y)) * 2

        """
        loop over all possible splits for the decision tree. find the best split.
        if no split exists that is less than 2 * error for the entire array
        then the data set is not split and the average for the entire array is used as
        the predictor
        """
        for i in range(len(x)):
            if len(x[:i]) < self.min_leaf_size:  # noqa: SIM114
                continue
            elif len(x[i:]) < self.min_leaf_size:
                continue
            else:
                error_left = self.mean_squared_error(x[:i], np.mean(y[:i]))
                error_right = self.mean_squared_error(x[i:], np.mean(y[i:]))
                error = error_left + error_right
                if error < min_error:
                    best_split = i
                    min_error = error

        if best_split != 0:
            left_x = x[:best_split]
            left_y = y[:best_split]
            right_x = x[best_split:]
            right_y = y[best_split:]

            self.decision_boundary = x[best_split]
            self.left = DecisionTree(
                depth=self.depth - 1, min_leaf_size=self.min_leaf_size
            )
            self.right = DecisionTree(
                depth=self.depth - 1, min_leaf_size=self.min_leaf_size
            )
            self.left.train(left_x, left_y)
            self.right.train(right_x, right_y)
        else:
            self.prediction = np.mean(y)

        return

    def predict(self, x):
        """
        predict:
        @param x: a floating point value to predict the label of
        the prediction function works by recursively calling the predict function
        of the appropriate subtrees based on the tree's decision boundary
        """
        if self.prediction is not None:
            return self.prediction
        elif self.left or self.right is not None:
            if x >= self.decision_boundary:
                return self.right.predict(x)
            else:
                return self.left.predict(x)
        else:
            print("Error: Decision tree not yet trained")
            return None


class TestDecisionTree:
    """Decision Tres test class"""

    @staticmethod
    def helper_mean_squared_error_test(labels, prediction):
        """
        helper_mean_squared_error_test:
        @param labels: a one dimensional numpy array
        @param prediction: a floating point value
        return value: helper_mean_squared_error_test calculates the mean squared error
        """
        squared_error_sum = float(0)
        for label in labels:
            squared_error_sum += (label - prediction) ** 2

        return float(squared_error_sum / labels.size)


def main():
    """
    In this demonstration we're generating a sample data set from the sin function in
    numpy.  We then train a decision tree on the data set and use the decision tree to
    predict the label of 10 different test values. Then the mean squared error over
    this test is displayed.
    """
    x = np.arange(-1.0, 1.0, 0.005)
    y = np.sin(x)

    tree = DecisionTree(depth=10, min_leaf_size=10)
    tree.train(x, y)

    rng = np.random.default_rng()
    test_cases = (rng.random(10) * 2) - 1
    predictions = np.array([tree.predict(x) for x in test_cases])
    avg_error = np.mean((predictions - test_cases) ** 2)

    print("Test values: " + str(test_cases))
    print("Predictions: " + str(predictions))
    print("Average error: " + str(avg_error))


if __name__ == "__main__":
    main()
    import doctest

    doctest.testmod(name="mean_squarred_error", verbose=True)
이 알고리즘에 대해
#!/usr/bin/env python
# coding: utf-8

# In[1]:


import numpy as np


# In[ ]:


class D_TREE :
    
    def fit(self,Xin):
        #fitting the values
        self.X=Xin                            #training_dataset_
        self.my_tree=self.tree(Xin)           #calls tree() function to create a tree based on the dataset provided
    
    
    
    def label_count(self,t):
        #count the unique labels
        count = {}                           #a dictionary that will store the no of times every label has occurred
        for i in range(len(t)):
            lbl = t[i][-1]                   #The last field or column in t actually contains the labels 
            if lbl not in count:
                count[lbl] = 0               #If the label is not present previously,initialize it with zero
            count[lbl]+=1                    #Everytime a particular label is encountered its count is increased by 1           
        return count

    
    
    
    class Question :
        #stores the question and matches the question 
        def __init__(self,col,value):
            self.col = col                  #The column to which the question belongs to
            self.question = value           #the particualr cell in the column which is treated as question
        
        
        def is_digit_or_char(self,n):
            #checks whether a particular value is a number or not
            return isinstance(n,int) or isinstance(n,float)
    
        def check(self,row):
            value=row[self.col]              #the value to be tested with the question
            if(self.is_digit_or_char(self.question)):
                return value&gt;=self.question  #if the value is numeric in nature check whether it is greater or equal to question
            else :
                return value==self.question  #if the value is a character or string check whether it is equal to the question or not
         
        
   

    def gini(self,t):
        #Calculates the gini score
        label = np.unique(t)                #No of unique labels
        impurity = 1
    
        for i in range(len(label)):
            impurity -= (np.sum(t[:,-1]==label[i])/t.shape[0])**2    #formula for calculating impurity based on probability
    
        return impurity

    
    

    def information_gain(self,l,r,current_uncertainity):
        #Information gain is calculated
        p = len (l) / float ( len(l) + len(r) )             
        return current_uncertainity - p*self.gini(l) - (1-p)*self.gini(r)
    
    
    
    
    def best_split(self,t):
        #Selects the best question and split based on the gini score
        maxm=0
        best_question = None
        tr_row=[]
        fl_row=[]
            
        for i in range(t.shape[1]-1):
            y=np.unique(t[:,i])                         #no of unique labels in a particular column
            m=y.shape[0]                                #no of examples
            for j in range(m):
                question = self.Question(i,y[j])        #each unique label is considered a question one at a time
                tr_row , fl_row = self.split(t,question)#splits the rows based on the question
                if(len(fl_row)==0 or len(tr_row)==0):
                    continue                            #if any of the branch has zero rows,the question is skipped
                
                info_gain= self.information_gain(tr_row,fl_row,self.gini(t))  #information gain is calculated
                
                if(info_gain&gt;maxm):
                    """best question
                       with maximum informaion
                       gain is selected"""
                    maxm = info_gain                 
                    best_question = question
                
        return maxm,best_question

    
    
   
    def split(self,t,question)
    #Splits the dataset based on the best question
        tr_row=[]       
        fl_row=[]
        for k in range(t.shape[0]):
            """checks every row of the dataset 
               with the queston & if it matches,
               it is appended to the true rows
               else to the false rows"""
            if question.check(t[k]):
                tr_row=np.append(tr_row,t[k])   
            else:
                fl_row=np.append(fl_row,t[k])
                    
        tr_row = np.reshape(tr_row,(len(tr_row)//t.shape[1],t.shape[1]))   #just reshapes the one-d matrix into a readable 2d matrix
        fl_row = np.reshape(fl_row,(len(fl_row)//t.shape[1],t.shape[1]))   #just reshapes the one-d matrix into a readable 2d matrix
    
        return tr_row,fl_row
    
    
    
    
    
    class Decision_Node:
        #Stores the different question,true branch and false branch for all parts of the tree
        def __init__(self,question,true_branch,false_branch):
            self.question = question                        
            self.true_branch = true_branch
            self.false_branch = false_branch


            
            
    class Leaf:
        #the terminal of a tree is the leaf
        def __init__(self,t):
            self.predictions = D_TREE().label_count(t)    

            
            
            

    def tree(self,t):
        """the most important part of the entire algorithm
        this is where the tree is constructed from the root 
        to the leaves"""
        gain,question = self.best_split(t)                #best question with maximum gain is selected
        if(gain==0):
            return self.Leaf(t)                           #no gain indicates that leaf is reached
        
        """if the control has reached this far,it means
        there is useful gain and teh datset can be subdivided
        or branched into true rows and false rows"""
        true_rows , false_rows = self.split(t,question)    
        true_node = self.tree(true_rows)                  #A recursion is carried out till all the true rows are found out
        false_node= self.tree(false_rows)                 #after true rows,the false rows are assigned to the node in a reverse fashion
                                                            
        return self.Decision_Node(question,true_node,false_node)  
    
    
    
    
    
    def check_testing_data(self,test,node):
        #checks the testing data by recursively calling itself
        if isinstance(node,self.Leaf):
            return node.predictions        #when the leaf is reached prediction is made
        
        """a row is made to travel in the tree,till it reaches a leaf,
           it is checked with all decision nodes, and accordingly
           it travels along true branch or false branch,till
           it reaches a leaf"""
        if(node.question.check(test)):
            return self.check_testing_data(test,node.true_branch)
        else:
            return self.check_testing_data(test,node.false_branch)
        
    
    
    
    
    def print_leaf(self,LEAF):
        #prints a leaf
        p={}
        for i in LEAF.keys():
            p[i] = str(100*LEAF[i]/float(sum(LEAF.values()))) + "%"
        
        print(p)
        
    
    
    
    
    def pred(self,X_test):
        #predicts values for test data
        y_pred=[0]*X_test.shape[0]
        for i in range(X_test.shape[0]):
            """when a row reaches a particular leaf
               it is assigned the label which
               appears maximum in the leaf"""
            r= self.check_testing_data(X_test[i],self.my_tree)      #deals with one row at a time
            y_pred[i] = max(r.keys(), key=(lambda k: r[k]))         
        return y_pred
    
    
    
    
    
    def accuracy(self,y_test,y_pred):
        #Calculate the accuracy of the model
        return np.mean(y_test==y_pred)*100
    
    
    

#Importing libraries
import numpy as np

#Importing data set
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()


X=data['data']
Y=data['target']
Y=np.reshape(Y,(Y.shape[0],1))
X=X[:,:5]
X=np.append(X,Y,axis=1)
#Separating training set and test set
sz = len(X)//4
X_test=X[:sz,:]
X_train=X[sz:,:]   #X_train already contains Y_train as its last column so no need to define it separately
Y_test=X_test[:,-1]
n=X.shape[1]
#Training the algorithm
train=D_TREE()
train.fit(X_train)
#Predictions on test data
y_pred=train.pred(X_test)
#A few predictions
for i in range(10):
    train.print_leaf(train.check_testing_data(X_test[i],train.my_tree))
{1.0: &#x27;100.0%&#x27;}
{0.0: &#x27;100.0%&#x27;}
{0.0: &#x27;100.0%&#x27;}
{1.0: &#x27;100.0%&#x27;}
{0.0: &#x27;100.0%&#x27;}
{1.0: &#x27;100.0%&#x27;}
{0.0: &#x27;100.0%&#x27;}
{0.0: &#x27;100.0%&#x27;}
{0.0: &#x27;100.0%&#x27;}
{1.0: &#x27;100.0%&#x27;}
print("Accuracy of my model : ",train.accuracy(Y_test,y_pred),"%")
Accuracy of my model :  80.28169014084507 %
#calculating accuracy using sklearn model
from sklearn import tree
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train[:,0:n-1], Y_train)
y_pred=clf.predict(X_test[:,:n-1])
error=(y_pred==Y_test.flatten())
print("accuracy of sklearn model = ",np.mean(error)*100,"%")
accuracy of sklearn model =  80.28169014084507 %