Access most recent genomic metadata .csv [05Feb2021, n=30, min_samples=20) here
For offline access: File > Download > .csv
23andMe Allele Frequency Compiler (GeneFREQSNP3.0) [05Feb2021]
Pulled from search of 28,578 23andMe chip v4 snps (v4 snps) saved in “snp.txt”
30 PSSD patient genomes analyzed (threshold for snp inclusion = 20 samples)
Compiled by PSSDLab GeneFREQSNP 3.0 (code below)
Access most recent genomic metadata .csv [05Feb2021, n=30, min_samples=20) here
# Created 06Sep2019
# Updated 05Feb2021
import os
# SET DIRECTORY
os.chdir(r"C:\Users\")
print('Directory set to \Genome')
print('---')
import time
file_count = int(input("How many genomes are being analyzed? = "))
results = open('results_freq.txt','w')
pro = 0
cac=0
lf = ""
k = []
umu = endtime = gap = 0
req = int(input("The minimum number of the SNP samples = "))
if req == 0:
req = 1
def Frequency(s):
global Apy, Cpy, Gpy, Tpy, totalp, inap
Ap = Cp = Gp = Tp = totalp = inap = 0
for h in range(file_count):
# SET DIRECTORY
os.chdir(r"C:\Users\")
file = open(str(h) + ".txt",'r')
l = file.readlines()
for line in l:
if s in line[:13]:
totalp += 2
if "C" in line[14:] :
Cp += 1
if "CC" in line[14:] :
Cp += 1
if "G" in line[14:] :
Gp += 1
if "GG" in line[14:] :
Gp += 1
if "A" in line[14:] :
Ap += 1
if "AA" in line[14:] :
Ap += 1
if "T" in line[14:] :
Tp += 1
if "TT" in line[14:] :
Tp += 1
break
if totalp >= req:
Cpy = (float(Cp) / float(totalp)) * 100
Gpy = (float(Gp) / float(totalp)) * 100
Apy = (float(Ap) / float(totalp)) * 100
Tpy = (float(Tp) / float(totalp)) * 100
results.write(s+','+str(Apy)+','+str(Cpy)+','+str(Gpy)+','+str(Tpy)+'\n')
results.flush()
else:
inap = 1
pssd_file = open("snp.txt","r")
for i in pssd_file:
cac += 1
pssd_file.seek(0)
pointime = time.perf_counter()
for x in pssd_file:
Frequency(x[:-1] + " ")
results.close()
print('Analysis complete. Thank you for using GeneFREQSNP 3.0 from the PSSDLab. Credits: Ghost, Cipro. Updated 05Feb2021')
NCBI snp Frequency Scrapper
After some tinkering, I finally found a way to compile a large dataset for comparison against data compiled by the GeneFREQSNP3.0 program.
Using the Python package “Selenium”, I was able to scrape allele frequency data from the NCBI website, which publishes the compiled snp allele frequency data of over 18,000 patients.
This Python script finalizes a long-standing issue with my method: eliminating the need to open each 23andme .txt file for each snp – a very time-intensive computational process.
Raw results can be found below (WordPress no longer allows .csv so it is in Excel format. From Excel/open office/Google Sheets it can be re-converted to .csv if needed)
# Created 07Oct2021
# Imports and update Selenium Driver
from selenium_driver_updater import DriverUpdater
from selenium import webdriver
import os
# Selenium and Path Setup
base_dir = os.path.dirname(os.path.abspath(__file__))
filename = DriverUpdater.install(path=base_dir, driver_name=DriverUpdater.chromedriver, upgrade=True, check_driver_is_up_to_date=True, old_return=False)
browser = webdriver.Chrome(filename)
# Open Correct Page
os.chdir(r"C:\) # File location
snp_strip = open('Snp_list_[11805]_05Feb2021_[txt].txt','r')
snp_strip = snp_strip.readlines()
snp_strip = [i.strip() for i in snp_strip]
# Write line
def write_line():
for rs in snp_strip:
try:
# Find goal snp page and open it
url ='https://www.ncbi.nlm.nih.gov/snp/'+rs+'#frequency_tab'
browser.get(url)
# Scrape Data
ref_allele = browser.find_element_by_xpath('.//*[@id="popfreq_datatable"]/tbody/tr[1]/td[4]').text
alt_allele = browser.find_element_by_xpath('.//*[@id="popfreq_datatable"]/tbody/tr[1]/td[5]').text
# Prepare Scrape for processing
popfreq_ref_allele_pre = [ref_allele]
popfreq_alt_allele_pre = [alt_allele]
# Process Scrape
freq_pre = [str(popfreq_ref_allele_pre).strip("[']") + ', '+ str(popfreq_alt_allele_pre).strip("[']")]
freq_pos = str(freq_pre).strip("[']")
alleles = freq_pos.split(',')
# Process for writing
a_freq = [alleles for alleles in alleles if "A" in alleles]
a_freq = str(a_freq)
a_freq = a_freq[4:-2]
if not a_freq:
a_freq = "0"
g_freq = [alleles for alleles in alleles if "G" in alleles]
g_freq = str(g_freq)
g_freq = g_freq[4:-2]
if not g_freq:
g_freq = "0"
c_freq = [alleles for alleles in alleles if "C" in alleles]
c_freq = str(c_freq)
c_freq = c_freq[4:-2]
if not c_freq:
c_freq = "0"
t_freq = [alleles for alleles in alleles if "T" in alleles]
t_freq = str(t_freq)
t_freq = t_freq[4:-2]
if not t_freq:
t_freq = "0"
line = rs + ',' + a_freq + ',' + t_freq + ',' + c_freq + ',' + g_freq
print(line)
line = line.translate({ord(c): None for c in "ACGT="})
# Set directory to Genome Folder to write the results file
os.chdir(r"C:\) # File location
results = open('ncbi_results.txt','a')
results.write(line+'\n')
results.flush()
results.close()
print(line + ' successfully added to results file')
except:
print(rs+' not found')
# Call write_line
write_line()
Analysis
Download Full .xlsx (Excel) Analysis File Below:
Sheet 1: full_results_10Oct2021
snp | a_freq_c | a_freq_p | a_diff | t_freq_c | t_freq_p | t_diff | c_freq_c | c_freq_p | c_diff | g_freq_c | g_freq_p | g_diff |
SNP | Control (NCBI) A allele frequency | PSSD A allele frequency | =abs(a_freq_c – a_freq_p) | Control (NCBI) T allele frequency | PSSD T allele frequency | =abs(t_freq_c – t_freq_p) | Control (NCBI) C allele frequency | PSSD C allele frequency | =abs(c_freq_c – c_freq_p) | Control (NCBI) G allele frequency | PSSD G allele frequency | =abs(g_freq_c – g_freq_p) |
Sheet 2: a_diff (All a_diff >50 snps, sorted largest to smallest a_diff)
Sheet 3: t_diff (All t_diff >50 snps, sorted largest to smallest t_diff)
Sheet 4: c_diff (All c_diff >50 snps, sorted largest to smallest c_diff)
Sheet 5: g_diff (All g_diff >50 snps, sorted largest to smallest g_diff)