-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathpreprocess_reference_data.py
More file actions
91 lines (78 loc) · 3.49 KB
/
preprocess_reference_data.py
File metadata and controls
91 lines (78 loc) · 3.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
"""
Internal data preparation scripts for generating reference files.
These scripts were used to create the reference data distributed with this repo
and are not required for running the 3D neighborhood test. Users should download
the pre-built reference data as described in the README.
- prepare_hdf5(): converts a TSV of missense variants to the HDF5 reference file
- make_common_filter(): reads gnomAD v4 allele frequencies from Google Cloud Storage
and writes the common_variants_uniprot.tsv filter file
"""
import pandas as pd
import h5py
import hdf5plugin
import csv
from read_data import load_ref_for_chrom
import os
def write_dataset(fid, name, data):
dset = fid.create_dataset(
name,
data = data,
compression = hdf5plugin.Zstd(clevel=22)
)
def prepare_hdf5():
fid = h5py.File('all_missense_variants_gr38.h5','w')
variants = pd.read_csv('all_missense_variants_gr38.tsv.gz', usecols=['chrom', 'pos', 'ref', 'alt', 'aa_pos', 'aa_ref', 'aa_alt', 'uniprot_id', 'filename', 'file_pos'], sep='\t')
for chrom, df in variants.groupby('chrom'):
write_dataset(fid, f'{chrom}_ref_alt', df[['ref', 'alt', 'aa_ref', 'aa_alt']].astype('S') )
write_dataset(fid, f'{chrom}_filename', df[['filename']].astype('S'))
write_dataset(fid, f'{chrom}_uniprot_id', df[['uniprot_id']].astype('S'))
write_dataset(fid, f'{chrom}_pos', df[['pos', 'aa_pos', 'file_pos']])
fid.close()
def make_common_filter(reference_directory):
af = 'gs://missense-scoring/ancestries_gnomad_v4.tsv.bgz/ancestries_gnomad_v4.tsv.bgz'
pops = ['afr', 'amr', 'asj', 'eas', 'fin', 'mid', 'nfe', 'sas', 'remaining']
temp_out = 'common_variants_grch38.tsv'
first_chunk = True
for chunk in pd.read_csv(
af,
sep='\t',
compression='gzip',
chunksize=500_000
):
# compute frequencies
for pop in pops:
chunk[f'freq_{pop}'] = chunk[f'AC_{pop}'] / chunk[f'AN_{pop}']
chunk['max_af'] = chunk[[f'freq_{pop}' for pop in pops]].max(axis=1)
common = chunk.loc[chunk['max_af'] > 0.05, ['locus', 'alleles']]
# append to disk
common.to_csv(
temp_out,
sep='\t',
index=False,
quoting=csv.QUOTE_NONE,
mode='w' if first_chunk else 'a',
header=first_chunk
)
first_chunk = False
common = pd.read_csv(temp_out, sep='\t')
common['chr'] = common['locus'].str.split(':').str[0]
common['pos'] = common['locus'].str.split(':').str[1].astype(int)
common['ref'] = common['alleles'].str.split('"').str[1]
common['alt'] = common['alleles'].str.split('"').str[3]
common['Variant ID'] = common['chr'] + '-' + common['pos'].astype(str) + '-' + common['ref'] + '-' + common['alt']
result = []
ref_path = f'{reference_directory}/all_missense_variants_gr38.h5'
for chrom, rvas_data_by_chr in common.groupby('chr'):
ref = load_ref_for_chrom(ref_path, chrom, rvas_data_by_chr['pos'])
if ref is None:
continue
joined = rvas_data_by_chr.join(ref, on='Variant ID', how='inner', rsuffix='_ref')
joined = joined[['Variant ID', 'uniprot_id', 'aa_pos', 'aa_ref', 'aa_alt']]
result.append(joined)
result = pd.concat(result)
result.to_csv(f'{reference_directory}/common_variants_uniprot.tsv', sep='\t')
os.remove(temp_out)
if __name__ == "__main__":
# prepare_hdf5()
reference_directory = '../sir-reference-data/'
make_common_filter(reference_directory)