# IMPORT STATEMENTS
import sys
sys.path.append("/Users/rohan/public_html/Hegemon")
import StepMiner as smn
import HegemonUtil as hu
import re
import numpy as np
import math
import pandas as pd
import scanpy as sc
import os
import GEOparse

def getDataInfo(path_dir=None):
    '''Get the accession ID, GPL (platform name) and filepath for the Hegemon files.

    The accession ID is the last component of ``path_dir``. It is passed to
    ``GEOparse.get_GEO`` with ``path_dir`` as the destination directory, and
    the first platform key of the returned object is used as the GPL.

    Args:
        path_dir: Directory whose last component is the accession ID. Defaults
            to the current working directory. Trailing "/" characters are
            ignored and ``os.PathLike`` objects are accepted.

    Returns:
        Tuple ``(accessionID, gpl, filepath)`` where ``filepath`` is the
        prefix "<path_dir>/<accessionID>-<gpl>" shared by the Hegemon files.

    Raises:
        IndexError: If the object returned by GEOparse lists no platforms.
    '''
    if path_dir is None:
        print("Writing to current directory")
        path_dir = os.getcwd()
    path_dir = os.fspath(path_dir)
    # Ignore trailing slashes: "/data/GSE123/" must still yield "GSE123"
    # rather than an empty accession ID (and a doubled slash in the prefix).
    base_dir = path_dir.rstrip("/") or path_dir
    accessionID = base_dir.split("/")[-1]
    gse = GEOparse.get_GEO(geo=accessionID, destdir=path_dir)
    # Only the first platform is used, even if several are listed.
    gpl = list(gse.gpls.keys())[0]
    filepath = "%s/%s-%s" % (base_dir, accessionID, gpl)
    return accessionID, gpl, filepath

def make_idx(expr_name, idx_name):
    '''Build index file from expression file.

    The expression file is tab-separated with one header line. For every
    following row the index records the probe ID (first field), the byte
    offset at which the row starts in the expression file, and the second
    field split at its first ":" into a name and a description.

    Args:
        expr_name: Path of the expression file to read (decoded as UTF-8).
        idx_name: Path of the index file to write (overwritten).

    Blank rows are skipped, and a row with no second field gets an empty
    name and description.
    '''
    print('Starting make_idx')

    rows = []
    # Binary mode so that offsets are exact byte positions in the file.
    with open(expr_name, 'rb') as f:
        pos = len(f.readline())  # the header row is not indexed
        for line in f:
            start = pos
            pos += len(line)
            # Only the first two fields are needed; strip the line ending so
            # it cannot leak into the description of a two-column row.
            fields = line.decode("utf-8").rstrip("\r\n").split('\t', 2)
            if fields == ['']:
                continue
            annotation = fields[1] if len(fields) > 1 else ''
            name, _, desc = annotation.partition(':')
            rows.append((fields[0], start, name, desc))

    # The index is written only after the whole expression file was read.
    with open(idx_name, 'w', encoding='utf-8') as f:
        f.write('ProbeID\tPtr\tName\tDescription\n')
        f.writelines('{}\t{}\t{}\t{}\n'.format(*row) for row in rows)
    print("Done with make_idx")

def writeExprIdx(expr, path_dir=None):
    '''Write the expression dataframe to disk, then build its index file.

    Both output paths share the prefix returned by getDataInfo(path_dir):
    "<prefix>-expr.txt" for the expression table and "<prefix>-idx.txt" for
    the index produced by make_idx.
    '''
    _, _, prefix = getDataInfo(path_dir)
    expr_name = "{}-expr.txt".format(prefix)
    idx_name = "{}-idx.txt".format(prefix)

    print("Starting expr at", expr_name)
    # NOTE: if the data contains NaN, passing na_rep=0 to to_csv writes 0
    # instead of an empty field.
    expr.to_csv(expr_name, sep='\t', header=True, index=False)
    print("Done writing expr")

    print("Writing idx to", idx_name)
    make_idx(expr_name, idx_name)
    
def printConf(dbid, name, key=''):
    '''Print the explore.conf (configuration file) entry for the dataset.

    The file paths and the source accession come from getDataInfo() called
    on the current working directory.
    '''
    accessionID, gpl, filepath = getDataInfo()
    entry = (
        ("[%s]", dbid),
        ("name = %s", name),
        ("expr = %s-expr.txt", filepath),
        ("index = %s-idx.txt", filepath),
        ("survival = %s-survival.txt", filepath),
        ("indexHeader = %s-ih.txt", filepath),
        ("info = %s-info.txt", filepath),
        ("key = %s", key),
        ("source = %s", accessionID),
    )
    # One printed line per (template, value) pair, in configuration order.
    for template, value in entry:
        print(template % value)
    
def writeConf(dbid, name, key='', cf='/Users/rohan/public_html/Hegemon/explore.conf'):
    '''Append the dataset's entry to explore.conf (configuration file).

    The file paths and the source accession come from getDataInfo() called
    on the current working directory; the entry is appended to ``cf``.
    '''
    accessionID, gpl, filepath = getDataInfo()
    entry = (
        ("\n[%s]\n", dbid),
        ("name = %s\n", name),
        ("expr = %s-expr.txt\n", filepath),
        ("index = %s-idx.txt\n", filepath),
        ("survival = %s-survival.txt\n", filepath),
        ("indexHeader = %s-ih.txt\n", filepath),
        ("info = %s-info.txt\n", filepath),
        ("key = %s\n", key),
        ("source = %s\n", accessionID),
    )
    # Append mode: existing entries in the configuration file are kept.
    with open(cf, 'a') as conf_file:
        for template, value in entry:
            conf_file.write(template % value)

def getDelim(path):
    '''Guess the delimiter of a text file from its first two lines.

    Counts tab, space, comma and semicolon in the first two lines and
    returns the most frequent one. On a tie the candidate listed first wins,
    so an empty file yields a tab.
    '''
    candidates = ("\t", " ", ",", ";")
    with open(path, 'r') as handle:
        sample = handle.readline() + handle.readline()
    # max() keeps the first maximal element, which gives the tie-break order.
    return max(candidates, key=sample.count)

def types_dict(path, header=0):
    '''Reduce memory usage by finding smallest data type of each column.

    Reads the first 3 rows of the file using the delimiter guessed by
    getDelim, then downcasts every column except the first with
    ``pd.to_numeric(..., downcast='unsigned')``.

    Args:
        path: Path of the delimited text file to inspect.
        header: Row number to use as column names (passed to pd.read_csv).

    Returns:
        Tuple ``(sep, column_types)`` where ``column_types`` maps each column
        name to the name of its dtype.

    NOTE(review): dtypes are inferred from only 3 rows, so values further
    down the file may not fit the reported type - confirm with the callers.
    '''
    sep = getDelim(path)
    df = pd.read_csv(path, nrows=3, sep=sep, header=header)
    for col in df.columns[1:]:
        try:
            df[col] = pd.to_numeric(df[col], downcast='unsigned')
        except (ValueError, TypeError):
            # Non-numeric column (e.g. text annotations): keep the dtype
            # pandas inferred instead of aborting the whole scan.
            continue
    column_types = {col: dtype.name for col, dtype in df.dtypes.items()}
    return sep, column_types