# Source code for nnsa.edfreadpy.anonymization.anonymization
"""
Module for anonymization of data files/records.
"""
import datetime
import os
import random
from nnsa.edfreadpy.anonymization.config import MIN_DAYS_TO_SHIFT, MAX_DAYS_TO_SHIFT
# Public API of this module: the names exported by a star-import.
__all__ = [
'compute_anonymized_date',
'compute_days_to_shift',
'extract_patient_file_id'
]
def compute_anonymized_date(date, id, seed_offset):
    """
    Anonymize a date by shifting it by a pseudo-random number of days that is derived from an ID.

    The shift is obtained from `compute_days_to_shift`, which seeds its random generator with `id` (e.g. a
    patient file ID) and `seed_offset`. Dates that are anonymized with the same `id` and `seed_offset` are
    therefore all shifted by the same number of days.

    Args:
        date (datetime.date): date to anonymize.
        id (str): string that is used to seed the random generator.
        seed_offset (int): offset to the seed that is determined by `id`.

    Returns:
        anonymized_date (datetime.date): the shifted (anonymized) date.
    """
    # The shift depends only on (id, seed_offset), not on the date itself.
    shift = datetime.timedelta(days=compute_days_to_shift(id, seed_offset))
    return date + shift
def compute_days_to_shift(id, seed_offset):
    """
    Compute the number of days to shift a date (e.g. by adding) when the date needs to be converted to its
    anonymized date.

    The number of days is drawn from a random generator that is seeded with a value derived from `id` and
    `seed_offset`, such that this function returns the same output when called with the same arguments.
    The generator is private to this call, so the state of the module-level `random` generator is not modified.

    Args:
        id (str): id that determines the anonymization (runs with same id will return equal numbers).
        seed_offset (int): offset that is added to the seed that is determined by `id`.

    Returns:
        (int): random number (seeded) between MIN_DAYS_TO_SHIFT and MAX_DAYS_TO_SHIFT (both rounded to the
            nearest integer, both inclusive).

    Raises:
        ValueError: if `id` is empty, since no seed can be derived from it.
    """
    # Create seed value from ID by concatenating the integers representing Unicode code points of the characters in ID.
    code_points = ''.join(str(ord(c)) for c in id)
    if not code_points:
        raise ValueError('Cannot derive a seed from an empty id.')

    # NOTE(review): the purpose of the constant 43 is not documented here; it is kept as is, because changing it
    # would change the shift of every id.
    seed = int(code_points) - 43 + seed_offset

    # Use a dedicated generator instead of random.seed(), which would reseed the global generator and thereby
    # make all subsequent random numbers in the calling program deterministic.
    rng = random.Random(seed)

    # Generate a random number of days.
    return rng.randint(round(MIN_DAYS_TO_SHIFT), round(MAX_DAYS_TO_SHIFT))
def extract_patient_file_id(filename, base_id=None):
    """
    Takes a file name and extracts the patient file id from it, assuming some general structuring of the filenames.

    The id is derived in the following order:
        1. The directory and everything from the first '.' onwards are removed.
        2. If `base_id` is specified, everything up to and including its first (case insensitive) occurrence
           is removed.
        3. One trailing lower case character is removed (assuming that it specifies the measurement #).
        4. The result is converted to lower case and all white space is removed.
        5. The part after the last underscore is removed (assuming that it specifies the measurement #).
        6. The lower case `base_id` (if specified) is prepended.

    E.g. extract_patient_file_id('EEG43a') and extract_patient_file_id('EEG43_1') both return 'eeg43'.

    Since step 3 runs before step 5, extract_patient_file_id('EEG43a_1') returns 'eeg43a'.
    NOTE(review): if such files should map to 'eeg43' as well, steps 3 and 5 need to be swapped. That would change
    the ids (and anything seeded with them) of existing files, so confirm before changing.

    Args:
        filename (str): filename. May or may not include directory path and/or extension.
        base_id (str, optional): Optionally specify a base for the patient id. The extracted patient file id will
            start with this base id (in lower case). Raises an error if base id cannot be identified in the
            filename. Not case sensitive.

    Returns:
        (str): the lower case id specifying the patient (e.g. 'eeg43').

    Raises:
        ValueError: if `base_id` is specified but does not occur in the filename.
    """
    # Remove directory from filename (if it includes one).
    filename = os.path.split(filename)[1]

    # Remove extension from filename (if it has an extension).
    filename = filename.split('.')[0]

    # Extract base id.
    if base_id:
        base_lower = base_id.lower()
        # Locate the base id instead of assuming that it is a prefix, so that only the part after it remains.
        start = filename.lower().find(base_lower)
        if start < 0:
            raise ValueError('Specified base ID "{}" not found in filename "{}".'.format(base_id, filename))
        filename = filename[start + len(base_lower):]
        base_id = base_lower
    else:
        base_id = ''

    # If the last character is a lower alphabetical character, delete it (assuming that it specifies the
    # measurement #). Slicing with [-1:] keeps this safe when nothing is left of the filename.
    patient_file_id = filename[:-1] if filename[-1:].islower() else filename

    # Make ID case insensitive (convert upper case characters to lower case) and remove white spaces.
    patient_file_id = ''.join(patient_file_id.lower().split())

    # Take only the part before the last underscore (assuming the underscore is used to specify the measurement #).
    split_parts = patient_file_id.split('_')
    if len(split_parts) > 1:
        patient_file_id = '_'.join(split_parts[:-1])

    return base_id + patient_file_id