DiffPriv.diff

View Source

from . import np
from . import random as rd
from . import math

def randresponse(response_list):
    """
    Randomize a list of responses in place with the Random Response mechanism.

    Random Response is a simple differential privacy algorithm. To use it pass one parameter:

        randresponse(response_list)

    ## Parameters

    - response_list
        Mutable list with data. Every position is overwritten with 0 or 1.

    ## Returns

    None. The list is modified in place.
    """
    # Walk the positions, not the stored values: using a stored value as an
    # index skips entries and raises IndexError for values outside the list.
    for i in range(len(response_list)):
        # First coin: 0 forces the reported answer to 0.
        # NOTE(review): the textbook mechanism keeps the true answer on one
        # branch; the existing output distribution is kept here -- confirm intent.
        if np.random.randint(2) == 0:
            response_list[i] = 0
        else:
            # Second coin picks the reported answer uniformly from {0, 1}.
            response_list[i] = int(np.random.randint(2))

def lapmech(data, file_name, epsilon, f, sample_size=10, delta_f=None):
    """
    Performs the Laplace mechanism.

        lapmech(data, file_name, epsilon, f, sample_size=10, delta_f=None)

    Every value of the comma-separated input receives independent, randomly
    signed exponential noise, and each noisy row is written to file_name as
    the string form of a Python list, one row per line.

    ## Parameters

    - data
        A readable text file object with comma-separated numeric rows. You should pass in something like this:
            open('data.csv', 'r')
        Make sure that it has reading permissions. Whitespace-only lines are ignored.

    - file_name
        This is the name of the file that the privatized data will go into. It doesn't have to be created for this to work; an existing file is overwritten.

    - epsilon
        Privacy-loss parameter; must be non-zero. Smaller values cause larger data changes while larger values may compromise privacy.

    - f
        f should be a function that takes in a list (all the elements in a certain column of the data) and produces a number (like the average, standard deviation, etc.). It is only called when delta_f has to be estimated.

    ## Optional Parameters

    - delta_f
        The sensitivity of f: either one number used for every column or a sequence with one entry per column. When left as None it is estimated per column by adding 1 to randomly sampled entries and taking the largest observed change of f (never below 0.01).

    - sample_size
        Number of random entries per column used to estimate delta_f. It will default to 10 if left empty and is capped at the number of rows.

    ## Returns

    The output file object, opened in 'w+' mode, flushed and left open. Closing it is the caller's responsibility.
    """
    # Parse the whole input once instead of re-splitting a row for every cell.
    raw_rows = [entry.split(',') for entry in data.readlines() if entry.strip()]
    rows = len(raw_rows)
    # The first row fixes the column count; extra fields in later rows are ignored.
    columns = len(raw_rows[0]) if raw_rows else 0
    table = [[float(fields[c]) for c in range(columns)] for fields in raw_rows]

    new_data = open(file_name, 'w+')
    try:
        sample_size = min(rows, sample_size)

        if delta_f is None:
            delta_f = []
            for c in range(columns):
                column = [row[c] for row in table]
                samples = [int(rows * rd.random()) for _ in range(sample_size)]
                baseline = f(column)
                sensitivity = 0.01
                for i in samples:
                    # Perturb a single entry by one unit and measure how far f moves.
                    x_prime = list(column)
                    x_prime[i] += 1
                    sensitivity = max(sensitivity, abs(baseline - f(x_prime)))
                delta_f.append(sensitivity)
        elif isinstance(delta_f, (int, float)):
            # A single sensitivity value applies to every column.
            delta_f = [delta_f] * columns

        # expovariate(rate) has mean 1 / rate, so the noise magnitude has
        # mean 2 * delta_f / epsilon.
        # NOTE(review): the textbook Laplace scale is delta_f / epsilon; the
        # existing factor of 2 is kept -- confirm intent.
        rates = [1 / (2 * (delta_f[c] / epsilon)) for c in range(columns)]

        for row in table:
            line = []
            for c, value in enumerate(row):
                # Coin flip for the sign first, then the magnitude.
                sign = 1 if round(rd.random()) == 0 else -1
                line.append(value + sign * rd.expovariate(rates[c]))
            new_data.write(str(line) + '\n')

        # Make the rows visible on disk even if the caller never closes the handle.
        new_data.flush()
    except Exception:
        # Do not leak the output handle when parsing, f or the noise step fails.
        new_data.close()
        raise

    return new_data

def expmech(data, file_name, epsilon, u, r, sample_size=10, delta_u=None):
    """
    Performs the exponential mechanism.

        expmech(data, file_name, epsilon, u, r, sample_size=10, delta_u=None)

    For every column of the comma-separated input one candidate from r is
    drawn with probability proportional to
    exp(epsilon * u(column, candidate) / (2 * delta_u)). The chosen candidates
    are written to file_name as the string form of one Python list.

    ## Parameters

    - data
        A readable text file object with comma-separated numeric rows. You should pass in something like this:
            open('data.csv', 'r')
        Make sure that it has reading permissions. Whitespace-only lines are ignored.

    - file_name
        This is the name of the file that the privatized data will go into. It doesn't have to be created for this to work; an existing file is overwritten.

    - epsilon
        Privacy-loss parameter. Smaller values flatten the selection probabilities while larger values may compromise privacy.

    - u
        u should be a utility function that takes in a list (all the elements in a certain column of the data) and one candidate from r, and produces a numeric score. Higher scores are more likely to be selected.

    - r
        The candidate outputs. Any finite iterable works, for example:
            range(0, 10)

    ## Optional Parameters

    - delta_u
        The sensitivity of u: either one number used for every column or a sequence with one entry per column. When left as None it is estimated per column by adding 1 to randomly sampled entries and taking the largest observed change of u over all candidates (never below 0.01).

    - sample_size
        Number of random entries per column used to estimate delta_u. It will default to 10 if left empty and is capped at the number of rows.

    ## Returns

    The output file object, opened in 'w+' mode, flushed and left open. Closing it is the caller's responsibility.
    """
    # Parse the whole input once, column-major, since u works on whole columns.
    raw_rows = [entry.split(',') for entry in data.readlines() if entry.strip()]
    rows = len(raw_rows)
    # The first row fixes the column count; extra fields in later rows are ignored.
    columns = len(raw_rows[0]) if raw_rows else 0
    column_values = [[float(fields[c]) for fields in raw_rows] for c in range(columns)]
    # Materialize r so it can be traversed several times and paired with weights.
    candidates = list(r)

    new_data = open(file_name, 'w+')
    try:
        sample_size = min(rows, sample_size)

        if delta_u is None:
            delta_u = []
            for column in column_values:
                samples = [int(rows * rd.random()) for _ in range(sample_size)]
                baseline = [u(column, k) for k in candidates]
                sensitivity = 0.01
                for i in samples:
                    # Perturb a single entry by one unit and measure how far u moves.
                    x_prime = list(column)
                    x_prime[i] += 1
                    for k, base in zip(candidates, baseline):
                        sensitivity = max(sensitivity, abs(base - u(x_prime, k)))
                delta_u.append(sensitivity)
        elif isinstance(delta_u, (int, float)):
            # A single sensitivity value applies to every column.
            delta_u = [delta_u] * columns

        line = []
        for c, column in enumerate(column_values):
            if not candidates:
                # Nothing to choose from: keep the historical placeholder value.
                line.append(0)
                continue

            scores = [epsilon * u(column, k) / (2 * delta_u[c]) for k in candidates]
            # Shifting by the best score leaves the probabilities unchanged
            # and keeps math.exp from overflowing on large scores.
            top = max(scores)
            weights = [math.exp(score - top) for score in scores]
            threshold = sum(weights) * rd.random()

            # Inverse-CDF sampling. Weights are paired with candidates by
            # position, so r does not have to start at 0 or be contiguous.
            # The last candidate absorbs floating-point round-off.
            r_choice = candidates[-1]
            running_sum = 0.0
            for k, weight in zip(candidates, weights):
                running_sum += weight
                if threshold < running_sum:
                    r_choice = k
                    break

            line.append(r_choice)

        new_data.write(str(line) + '\n')
        # Make the result visible on disk even if the caller never closes the handle.
        new_data.flush()
    except Exception:
        # Do not leak the output handle when parsing or u fails.
        new_data.close()
        raise

    return new_data

# def randresponse(response_list):

View Source

def randresponse(response_list):
    """
    Randomize a list of responses in place with the Random Response mechanism.

    Random Response is a simple differential privacy algorithm. To use it pass one parameter:

        randresponse(response_list)

    ## Parameters

    - response_list
        Mutable list with data. Every position is overwritten with 0 or 1.

    ## Returns

    None. The list is modified in place.
    """
    # Walk the positions, not the stored values: using a stored value as an
    # index skips entries and raises IndexError for values outside the list.
    for i in range(len(response_list)):
        # First coin: 0 forces the reported answer to 0.
        # NOTE(review): the textbook mechanism keeps the true answer on one
        # branch; the existing output distribution is kept here -- confirm intent.
        if np.random.randint(2) == 0:
            response_list[i] = 0
        else:
            # Second coin picks the reported answer uniformly from {0, 1}.
            response_list[i] = int(np.random.randint(2))

randresponse applies the Random Response mechanism, a simple differential privacy algorithm, to a list in place. To use randresponse, pass one parameter:

randresponse(response_list)

Parameters

response_list List with data.

# def lapmech(data, file_name, epsilon, f, sample_size=10, delta_f=None):

View Source

def lapmech(data, file_name, epsilon, f, sample_size=10, delta_f=None):
    """
    Performs the Laplace mechanism.

        lapmech(data, file_name, epsilon, f, sample_size=10, delta_f=None)

    Every value of the comma-separated input receives independent, randomly
    signed exponential noise, and each noisy row is written to file_name as
    the string form of a Python list, one row per line.

    ## Parameters

    - data
        A readable text file object with comma-separated numeric rows. You should pass in something like this:
            open('data.csv', 'r')
        Make sure that it has reading permissions. Whitespace-only lines are ignored.

    - file_name
        This is the name of the file that the privatized data will go into. It doesn't have to be created for this to work; an existing file is overwritten.

    - epsilon
        Privacy-loss parameter; must be non-zero. Smaller values cause larger data changes while larger values may compromise privacy.

    - f
        f should be a function that takes in a list (all the elements in a certain column of the data) and produces a number (like the average, standard deviation, etc.). It is only called when delta_f has to be estimated.

    ## Optional Parameters

    - delta_f
        The sensitivity of f: either one number used for every column or a sequence with one entry per column. When left as None it is estimated per column by adding 1 to randomly sampled entries and taking the largest observed change of f (never below 0.01).

    - sample_size
        Number of random entries per column used to estimate delta_f. It will default to 10 if left empty and is capped at the number of rows.

    ## Returns

    The output file object, opened in 'w+' mode, flushed and left open. Closing it is the caller's responsibility.
    """
    # Parse the whole input once instead of re-splitting a row for every cell.
    raw_rows = [entry.split(',') for entry in data.readlines() if entry.strip()]
    rows = len(raw_rows)
    # The first row fixes the column count; extra fields in later rows are ignored.
    columns = len(raw_rows[0]) if raw_rows else 0
    table = [[float(fields[c]) for c in range(columns)] for fields in raw_rows]

    new_data = open(file_name, 'w+')
    try:
        sample_size = min(rows, sample_size)

        if delta_f is None:
            delta_f = []
            for c in range(columns):
                column = [row[c] for row in table]
                samples = [int(rows * rd.random()) for _ in range(sample_size)]
                baseline = f(column)
                sensitivity = 0.01
                for i in samples:
                    # Perturb a single entry by one unit and measure how far f moves.
                    x_prime = list(column)
                    x_prime[i] += 1
                    sensitivity = max(sensitivity, abs(baseline - f(x_prime)))
                delta_f.append(sensitivity)
        elif isinstance(delta_f, (int, float)):
            # A single sensitivity value applies to every column.
            delta_f = [delta_f] * columns

        # expovariate(rate) has mean 1 / rate, so the noise magnitude has
        # mean 2 * delta_f / epsilon.
        # NOTE(review): the textbook Laplace scale is delta_f / epsilon; the
        # existing factor of 2 is kept -- confirm intent.
        rates = [1 / (2 * (delta_f[c] / epsilon)) for c in range(columns)]

        for row in table:
            line = []
            for c, value in enumerate(row):
                # Coin flip for the sign first, then the magnitude.
                sign = 1 if round(rd.random()) == 0 else -1
                line.append(value + sign * rd.expovariate(rates[c]))
            new_data.write(str(line) + '\n')

        # Make the rows visible on disk even if the caller never closes the handle.
        new_data.flush()
    except Exception:
        # Do not leak the output handle when parsing, f or the noise step fails.
        new_data.close()
        raise

    return new_data

Performs the Laplace mechanism.

lapmech(data, file_name, epsilon, f, sample_size=10, delta_f=None)

Parameters

Data This is quite simple. You should pass in something like this: open('data.csv', 'r') Make sure that it has reading permissions.
file_name This is the name of the file that the privatized data will go into. It doesn’t have to be created for this to work.
epsilon This is one of the most important parameters in the entire algorithm. Unlike the random response mechanism, epsilon allows us to quantify privacy loss. This number should be a positive float like 3.14 or 2.71. Play around with different values to find which one works best but remember that smaller values will cause significant data changes while larger values may compromise privacy. Find the value that best works for your survey!
f f should be a function that takes in an array (all the elements in a certain column of the data) and produce an output (like the average, standard deviation, etc.). This should be a valid python function!

Optional Parameters

delta_f delta_f is the sensitivity of f. Remember that this is an optional parameter.
sample_size sample_size is also optional. It gives the program an idea of what delta_f should be. It will default to 10 if left empty.

# def expmech(data, file_name, epsilon, u, r, sample_size=10, delta_u=None):

View Source

def expmech(data, file_name, epsilon, u, r, sample_size=10, delta_u=None):
    """
    Performs the exponential mechanism.

        expmech(data, file_name, epsilon, u, r, sample_size=10, delta_u=None)

    For every column of the comma-separated input one candidate from r is
    drawn with probability proportional to
    exp(epsilon * u(column, candidate) / (2 * delta_u)). The chosen candidates
    are written to file_name as the string form of one Python list.

    ## Parameters

    - data
        A readable text file object with comma-separated numeric rows. You should pass in something like this:
            open('data.csv', 'r')
        Make sure that it has reading permissions. Whitespace-only lines are ignored.

    - file_name
        This is the name of the file that the privatized data will go into. It doesn't have to be created for this to work; an existing file is overwritten.

    - epsilon
        Privacy-loss parameter. Smaller values flatten the selection probabilities while larger values may compromise privacy.

    - u
        u should be a utility function that takes in a list (all the elements in a certain column of the data) and one candidate from r, and produces a numeric score. Higher scores are more likely to be selected.

    - r
        The candidate outputs. Any finite iterable works, for example:
            range(0, 10)

    ## Optional Parameters

    - delta_u
        The sensitivity of u: either one number used for every column or a sequence with one entry per column. When left as None it is estimated per column by adding 1 to randomly sampled entries and taking the largest observed change of u over all candidates (never below 0.01).

    - sample_size
        Number of random entries per column used to estimate delta_u. It will default to 10 if left empty and is capped at the number of rows.

    ## Returns

    The output file object, opened in 'w+' mode, flushed and left open. Closing it is the caller's responsibility.
    """
    # Parse the whole input once, column-major, since u works on whole columns.
    raw_rows = [entry.split(',') for entry in data.readlines() if entry.strip()]
    rows = len(raw_rows)
    # The first row fixes the column count; extra fields in later rows are ignored.
    columns = len(raw_rows[0]) if raw_rows else 0
    column_values = [[float(fields[c]) for fields in raw_rows] for c in range(columns)]
    # Materialize r so it can be traversed several times and paired with weights.
    candidates = list(r)

    new_data = open(file_name, 'w+')
    try:
        sample_size = min(rows, sample_size)

        if delta_u is None:
            delta_u = []
            for column in column_values:
                samples = [int(rows * rd.random()) for _ in range(sample_size)]
                baseline = [u(column, k) for k in candidates]
                sensitivity = 0.01
                for i in samples:
                    # Perturb a single entry by one unit and measure how far u moves.
                    x_prime = list(column)
                    x_prime[i] += 1
                    for k, base in zip(candidates, baseline):
                        sensitivity = max(sensitivity, abs(base - u(x_prime, k)))
                delta_u.append(sensitivity)
        elif isinstance(delta_u, (int, float)):
            # A single sensitivity value applies to every column.
            delta_u = [delta_u] * columns

        line = []
        for c, column in enumerate(column_values):
            if not candidates:
                # Nothing to choose from: keep the historical placeholder value.
                line.append(0)
                continue

            scores = [epsilon * u(column, k) / (2 * delta_u[c]) for k in candidates]
            # Shifting by the best score leaves the probabilities unchanged
            # and keeps math.exp from overflowing on large scores.
            top = max(scores)
            weights = [math.exp(score - top) for score in scores]
            threshold = sum(weights) * rd.random()

            # Inverse-CDF sampling. Weights are paired with candidates by
            # position, so r does not have to start at 0 or be contiguous.
            # The last candidate absorbs floating-point round-off.
            r_choice = candidates[-1]
            running_sum = 0.0
            for k, weight in zip(candidates, weights):
                running_sum += weight
                if threshold < running_sum:
                    r_choice = k
                    break

            line.append(r_choice)

        new_data.write(str(line) + '\n')
        # Make the result visible on disk even if the caller never closes the handle.
        new_data.flush()
    except Exception:
        # Do not leak the output handle when parsing or u fails.
        new_data.close()
        raise

    return new_data

Performs the exponential mechanism.

expmech(data, file_name, epsilon, u, r, sample_size=10, delta_u=None)

Parameters

Data This is quite simple. You should pass in something like this: open('data.csv', 'r') Make sure that it has reading permissions.
file_name This is the name of the file that the privatized data will go into. It doesn’t have to be created for this to work.
epsilon This is one of the most important parameters in the entire algorithm. Unlike the random response mechanism, epsilon allows us to quantify privacy loss. This number should be a positive float like 3.14 or 2.71. Play around with different values to find which one works best but remember that smaller values will cause significant data changes while larger values may compromise privacy. Find the value that best works for your survey!
u u should be a function that takes in an array (all the elements in a certain column of the data) and one candidate value from r, and produces a numeric utility score for that candidate. This should be a valid python function!
r r is any valid python range. You could pass in: range(0, 10)

Optional Parameters

delta_u delta_u is the sensitivity of u. Remember that this is an optional parameter.
sample_size sample_size is also optional. It gives the program an idea of what delta_f should be. It will default to 10 if left empty.