# Source code for nnsa.edfreadpy.anonymization.anonymization

"""
Module for anonymization of data files/records.
"""

import datetime
import os
import random

from nnsa.edfreadpy.anonymization.config import MIN_DAYS_TO_SHIFT, MAX_DAYS_TO_SHIFT

# Public API of this module: the names exported by a star-import.
__all__ = [
    'compute_anonymized_date',
    'compute_days_to_shift',
    'extract_patient_file_id'
]


def compute_anonymized_date(date, id, seed_offset):
    """
    Shift a date by a pseudo-random, ID-dependent number of days to anonymize it.

    The size of the shift is obtained from `compute_days_to_shift(id, seed_offset)`, so every date that is
    anonymized with the same `id` and `seed_offset` is moved by the same number of days.

    Args:
        date (datetime.date): date to anonymize.
        id (str): string (e.g. a patient file ID) that is used to seed the random generator.
        seed_offset (int): offset that is added to the seed derived from `id`.

    Returns:
        anonymized_date (datetime.date): the original date plus the ID-dependent number of days.
    """
    # The shift is fully determined by (id, seed_offset), which keeps the anonymization reproducible.
    shift = datetime.timedelta(days=compute_days_to_shift(id, seed_offset))
    return date + shift


def compute_days_to_shift(id, seed_offset):
    """
    Compute the number of days by which a date must be shifted (e.g. by adding) to obtain its anonymized date.

    The number of days is drawn from a random generator that is seeded with a value derived from `id` and
    `seed_offset`, so calling this function again with the same arguments returns the same number. A private
    generator instance is used, so the state of the module-level `random` generator is left untouched.

    Note: `random` is not a cryptographically secure generator and the shift can be recomputed by anyone who
    knows `id` and `seed_offset`.

    Args:
        id (str): id that determines the anonymization (runs with same id will return equal numbers).
        seed_offset (int): offset that is added to the seed derived from `id`.

    Returns:
        (int): seeded random number of days, between round(MIN_DAYS_TO_SHIFT) and round(MAX_DAYS_TO_SHIFT)
            (both inclusive).

    Raises:
        ValueError: if `id` is empty, since no seed can be derived from it.
    """
    # Create seed value from ID by concatenating the integers representing Unicode code points of the characters in ID.
    code_points = ''.join(str(ord(c)) for c in id)
    if not code_points:
        raise ValueError('Cannot derive a random seed from an empty id.')

    # The constant 43 is part of the existing seed formula; changing it would change the seed for every id.
    seed = int(code_points) - 43 + seed_offset

    # Use a dedicated generator instead of random.seed(), which would reseed the generator shared by the
    # whole process. A Random instance seeded with the same value yields the same sequence.
    generator = random.Random(seed)

    # Generate a random number of days.
    return generator.randint(round(MIN_DAYS_TO_SHIFT), round(MAX_DAYS_TO_SHIFT))


def extract_patient_file_id(filename, base_id=None):
    """
    Takes a file name and extracts the patient file id from it, assuming some general structuring of the filenames.

    The id is derived in the following order:
        1. The directory part and everything from the first '.' onwards are dropped.
        2. If `base_id` is given, everything up to and including its first (case-insensitive) occurrence is
           dropped from the name.
        3. If the last remaining character is a lower case letter, it is dropped.
        4. The remainder is converted to lower case and all white space is removed.
        5. If the remainder contains an underscore, only the part before the last underscore is kept.
        6. The lower-cased `base_id` (or '' if not given) is prepended.

    Examples:
        extract_patient_file_id('EEG43a') returns 'eeg43'
        extract_patient_file_id('EEG43_1') returns 'eeg43'
        extract_patient_file_id('EEG43a_1') returns 'eeg43a' (step 3 runs before step 5, and '1' is not a letter)
        extract_patient_file_id('EEG43a', base_id='eeg') returns 'eeg43'

    Args:
        filename (str): filename. May or may not include directory path and/or extension.
        base_id (str, optional): Optionally specify a base for the patient id. The extracted patient file id will
            start with this base id (in lower case). Not case sensitive.

    Returns:
         (str): the lower-case id specifying the patient (e.g. eeg43).

    Raises:
        ValueError: if `base_id` is specified but cannot be found in the filename.
    """
    # Remove directory from filename (if it includes one).
    filename = os.path.split(filename)[1]

    # Remove everything from the first dot onwards (the extension, if the name has one).
    filename = filename.split('.')[0]

    # Extract base id.
    if base_id:
        lowered_base = base_id.lower()
        # Locate the base id instead of assuming it is a prefix, so that the characters that are cut off are
        # really the base id (and whatever precedes it) even if the name does not start with it.
        start = filename.lower().find(lowered_base)
        if start < 0:
            raise ValueError('Specified base ID "{}" not found in filename "{}".'.format(base_id, filename))
        filename = filename[start + len(lowered_base):]
        base_id = lowered_base
    else:
        base_id = ''

    # If the last character is a lower alphabetical character, delete it (assuming that it specifies the
    # measurement #). Slicing with [-1:] keeps this safe when nothing is left of the filename.
    if filename[-1:].islower():
        filename = filename[:-1]

    # Make ID case insensitive (convert upper case characters to lower case) and remove white spaces.
    patient_file_id = ''.join(filename.lower().split())

    # Take only the part before the last underscore (assuming the underscore is used to specify the measurement #).
    head, underscore, _ = patient_file_id.rpartition('_')
    if underscore:
        patient_file_id = head

    return base_id + patient_file_id