Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Goci 2607 tw remove bkgd trait links #3

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
213 changes: 213 additions & 0 deletions trait-representation-migration/remove_background_trait_links.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@
# Activate Python venv for the script - uncomment to run script on commandline
# activate_this_file = "/path/to/bin/activate_this.py"
# execfile(activate_this_file, dict(__file__ = activate_this_file))

import cx_Oracle
import contextlib
import argparse
import sys
from tqdm import tqdm
import csv
import os.path

sys.path.insert(0, '/path/to/gwas_data_sources')
import gwas_data_sources

import datetime


def read_file(filename):
    '''
    Read the data file and return its rows without the header line.

    Args:
        filename: Name of file provided as a commandline argument.

    Returns:
        lines: List of the raw lines of the file (line endings kept),
            excluding the first (header) line. Empty if the file has
            no data rows.
    '''

    with open(filename, 'r') as data_file:
        # Discard the header row; the default avoids StopIteration
        # when the file is completely empty.
        next(data_file, None)
        lines = list(data_file)

    return lines


def process_file_contents(file_data, efo_map, cursor):
    '''
    Remove the STUDY_EFO_TRAIT link for every background trait in the file.

    Each line is split on tabs. The study accession is taken from the
    third column and the background trait label(s) from the fifth column;
    several labels in one cell are separated by '||'.

    Args:
        file_data: Iterable of tab-separated lines (contents of the file).
        efo_map: Dictionary with the lower-cased trait label as the key
            and the EFO_TRAIT ID as the value.
        cursor: Database cursor object used to run the delete statements.
    '''

    delimiter = '||'

    for row_number, line in enumerate(file_data, 1):
        columns = line.split('\t')

        # The fifth column is required; skip blank lines silently and
        # report anything else that is too short instead of raising
        # IndexError half-way through the file.
        if len(columns) < 5:
            if line.strip():
                sys.stderr.write(
                    'Skipping data row {}: expected at least 5 columns, '
                    'found {}\n'.format(row_number, len(columns)))
            continue

        background_column = columns[4].strip()
        if not background_column:
            # Nothing to unlink, so do not query the database for this row.
            continue

        # Review note: this assumes the script is only run on already
        # published studies, i.e. every row has an accession ID.
        study_accession = columns[2].strip()
        study_id = _get_study_id(study_accession)

        # str.split() returns a one-element list when the delimiter is
        # absent, so single and multiple traits share the same code path.
        for background_trait in background_column.split(delimiter):
            trait_label = background_trait.strip().lower()
            if not trait_label:
                continue

            # The input file is (more or less) manually created, so a
            # label may be missing from the EFO map; report it and carry
            # on rather than aborting with a bare KeyError.
            try:
                background_trait_id = efo_map[trait_label]
            except KeyError:
                sys.stderr.write(
                    'Skipping unknown background trait "{}" for study '
                    '{}\n'.format(trait_label, study_accession))
                continue

            _execute_delete_query(cursor, study_id, background_trait_id)


def database_connection(DATABASE_NAME):
    '''
    Connect to the database and return the connection and a cursor.

    Args:
        DATABASE_NAME (str): The name of the database, passed to
            gwas_data_sources.get_db_properties() to look up the
            connection details.

    Raises:
        cx_Oracle.DatabaseError: If unable to connect to the database.
            The error is printed and then re-raised so the caller does
            not continue with a missing connection.

    Returns:
        (connection, cursor): The open database connection and a cursor
            created from it.
    '''

    try:
        ip, port, sid, username, password = gwas_data_sources.get_db_properties(DATABASE_NAME)
        dsn_tns = cx_Oracle.makedsn(ip, port, sid)
        connection = cx_Oracle.connect(username, password, dsn_tns)

        cursor = connection.cursor()

        return connection, cursor

    except cx_Oracle.DatabaseError as exception:
        # "except ... as" and print() work on both Python 2.6+ and 3.
        print(exception)
        # Swallowing the error made this function return None, which the
        # caller then failed to unpack with an unrelated TypeError.
        raise


def get_efo_id_map(cursor):
    '''
    Build a lookup from trait label to EFO_TRAIT ID.

    Args:
        cursor (object): Database cursor object.

    Returns:
        efo_map: Dictionary with the lower-cased TRAIT label as the key
            and the EFO_TRAIT.ID as the value. If several rows share a
            lower-cased label, the last one fetched wins.
    '''

    efo_sql = '''
        SELECT ID, LOWER(TRAIT)
        FROM EFO_TRAIT
    '''

    cursor.execute(efo_sql)
    rows = cursor.fetchall()

    # Each row is (ID, lower-cased TRAIT); the label becomes the key.
    return {
        row[1]: row[0]
        for row in tqdm(rows, desc='Build EFO map')
    }


def _get_study_id(accession):
    '''
    Query the STUDY table with the accession to get the Study ID.

    Uses the module-level ``cursor`` created in the ``__main__`` section
    of this script.

    Args:
        accession (str): study accession parsed from input file

    Raises:
        LookupError: If no STUDY row has the given accession.

    Returns:
        study_id (int): STUDY.ID, the primary key for the row with the study accession.
    '''

    # Bind the accession instead of formatting it into the statement: the
    # value comes from a hand-edited file and may contain quotes.
    study_sql = '''
        SELECT ID
        FROM STUDY
        WHERE ACCESSION_ID = :accession
    '''

    cursor.execute(study_sql, {'accession': accession})
    study_row = cursor.fetchone()

    # fetchone() returns None when nothing matched; fail with a message
    # that names the accession instead of a "NoneType" subscript error.
    if study_row is None:
        raise LookupError(
            'No STUDY row found for accession {!r}'.format(accession))

    return study_row[0]



def _execute_delete_query(cursor, study_id, efo_trait_id):
    '''
    Delete one row from the STUDY_EFO_TRAIT table, then commit or roll back.

    The change is committed only when the script runs with
    ``--mode production`` (read from the module-level ``args``); in any
    other mode the delete is rolled back immediately.

    Args:
        cursor (object): Database cursor object.
        study_id: STUDY.ID of the study to unlink.
        efo_trait_id: EFO_TRAIT.ID of the background trait to unlink.
    '''

    # Bind variables keep the ids out of the SQL text and let the
    # database compare them with their native type.
    study_efo_trait_delete_sql = '''
        DELETE FROM STUDY_EFO_TRAIT
        WHERE STUDY_ID = :study_id AND EFO_TRAIT_ID = :efo_trait_id
    '''

    cursor.execute(study_efo_trait_delete_sql,
                   {'study_id': study_id, 'efo_trait_id': efo_trait_id})

    # commit or rollback changes
    if args.mode == 'production':
        cursor.execute('COMMIT')
    else:
        cursor.execute('ROLLBACK')


if __name__ == '__main__':
    # Remove background trait links to Studies.
    #
    # NOTE: ``args`` and ``cursor`` are deliberately module-level names;
    # _execute_delete_query() and _get_study_id() read them as globals.

    # Commandline arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--filename', default='study_background_traits-ALL.txt',
                        help='Name of data file (default: %(default)s).')
    parser.add_argument('--database', default='DEV3', choices=['DEV3', 'SPOTPRO'],
                        help='Database to connect to (default: %(default)s).')
    parser.add_argument('--mode', default='debug', choices=['debug', 'production'],
                        help='Run as (default: debug).')
    args = parser.parse_args()

    # Already at module scope, so no ``global`` statement is needed.
    DATABASE_NAME = args.database

    # Get database connection
    conn, cursor = database_connection(args.database)

    try:
        # Create map of Trait labels and EFO_IDs
        efo_map = get_efo_id_map(cursor)

        # Read data file
        data = read_file(args.filename)

        # Format column values and remove STUDY_EFO_TRAIT link
        process_file_contents(data, efo_map, cursor)
    finally:
        # Close database connection even if processing fails part-way.
        conn.close()



Loading