#include #include #include #include #include //#include //#include //using namespace std; const char FILE_NAME[] = "Homo_sapiens.NCBI36.48.dna.chromosome.13.fa"; const char EST_FILE[] = "reads1000.fa"; //const char FILE_NAME[] = "afseq.fasta"; //const char EST_FILE[] = "afest.fasta"; char seq_name[100]; // To store name of sequence after '>' before '\n' #define EMPTY -1 #define ANOTHERPRIME 8191 #define HASHLENGTH 13 // number of characters to use for hashing #define REPEATNUM 20 // number of repeat words to calculate and display #define MAX_EST_LENGTH 100 #define MAX_EST_NAME_LENGTH 40 #define COLLISION_THRESHOLD 50 // deal with repeats, maximum collision times #define GAP_NUM_THRESHOLD 3 #define e 2.718281828459 // used by hashFunc #define mu 0.33 // used by smith_waterman #define delta 1.33 // used by smith_waterman #define MATCH_PERCENT_THRESHOLD 95 // use this to determine mapped/found or not /**** global variables*******/ long long int probeCounter = 0; // probe counts int funcEvaNum = 0; // function evaluation time int hashSize = 0; // hash table size int estNum = 0; // EST number int estSeqCount = 0, nameCount = 0, alignmentNum = 0, estFoundNum = 0, est_checker = 0; int ind; // used by smith_waterman int checker; // used by hashSearch /**** function prototypes*****/ int charCount(FILE *in_file); int hashCal(int a); int encode(int b); bool check(int item, int *hashTable, int h, int checkNum); void Insert( int value, int i, int *hashTable, int *locTable); int hashFunc(int *intTable, int i, int length); int estCount(FILE *est_file); void estChar_count_store(FILE *est_file, char *estCharTable, char *estName); void reverse(char *estCharTable, char *rc_estCharTable); char complement(char a); void hashSearch(int value, int *hashTable, int *locTable, char *charTable, char *estCharTable, int count, int *estIntTable, int i, int rc); void kewordsCal(int count, int *intTable, int *keyword_hashTable, int *keyword_countTable, int *keyword_locTable, int *keywordTable, int *keywordCount); bool keyword_check(int item, int *hashTable, int h, int *keyword_countTable); void keyword_Insert( int value, int i, int *keyword_hashTable, int *keyword_countTable, int *keyword_locTable); int hash_value_location(int *keyword_hashTable, int value, int count); void repeat_cal_main(int count, int *intTable, char *charTable); double similarity_score(char a, char b); double find_array_max(double array[], int length); void print_alignment(double match_percentage, int tick, char *consensus_a, char *consensus_b, int *locTable, int h, int rc); void mismatch_tolower(char *consensus_a, char *consensus_b, int tick); void smith_waterman(char *seq_a, char *seq_b, int N_a, int N_b, int *locTable, int h, int rc); ///////////////////////////////////////////////////////////////////////////// int main() { clock_t startTime, finishTime; double duration; startTime = clock(); /***************************** The above three lines are for timing ************************/ FILE *in_file; // input file in_file = fopen(FILE_NAME, "r"); if (in_file == NULL) { printf("Cannot open %s\n", FILE_NAME); exit(8); } int count = charCount(in_file); // Count number of characters rewind(in_file); // Return to the beginning of the file printf("\nNumber of characters in %s is %d\n", FILE_NAME, count); printf("Allocating memory and saving data to charTable[count]...\n"); char *charTable = new char[count]; // Allocate memory for the characters in file int ch; // character or EOF flag from input while ( ( ch = fgetc(in_file)) != '\n' ) ; // skip the first line for (int i = 0; i < count; i++) // Read each item into appropriate location fscanf(in_file, "%c ", &charTable[i]); for (int i = 0; i < count; i++) charTable[i] = toupper(charTable[i]); printf("Characters from file scanned, toUppered and saved in an array\n"); fclose(in_file); // Done with the file // Now convert char to int and store it in another array printf("\nEncoding char to int and storing them in a new array: intTable[count]...\n"); int *intTable = new int[count]; for (int i = 0; i < count; i++) intTable[i] = encode((int)charTable[i]); // convert ATCG to corresponding numbers defined by encode(int) printf("Encoding done. Integers stored.\n"); // Now let's deal with the hash table... hashSize = hashCal(count); // get hash table size printf("hashSize = %d\n", hashSize); /*****calculate and print top repeat words****** repeat_cal_main(count, intTable, charTable); ***********************************************/ probeCounter = 0; int *hashTable = new int[hashSize]; printf("\nHash Table Size = %d\n", hashSize); printf("Initializing hash table hashTable[hashSize]...\n"); for (int i = 0; i < hashSize; i++) hashTable[i] = EMPTY; printf("Hash table initialized.\n"); printf("Initializing locTable[hashSize]...\n"); int *locTable = new int[hashSize]; // to keep track of the location on genome printf("Location table initialized.\n"); printf("\nindexing...\n"); for (int i = 0; i <= count - HASHLENGTH; i++) { int value = hashFunc(intTable, i, HASHLENGTH); Insert(value, i, hashTable, locTable); } printf("\nIndexing done.\n"); delete [] intTable; // done with intTable, release some memory printf("\nProbe counts: %Ld\n", probeCounter); printf("Total hash function evaluation times: %d\n", funcEvaNum); /***************************** The below three lines are for timing ************************/ finishTime = clock(); duration = (double)(finishTime - startTime) / CLOCKS_PER_SEC; printf("Indexing time: %3.2f seconds\n", duration); //////////////////////////////////////////////////////////////////////////////////////////// clock_t est_startTime, est_finishTime; double est_duration; est_startTime = clock(); /***************************** The above three lines are for timing ************************/ // Now let's deal with the EST file FILE *est_file; // input file char estName[MAX_EST_NAME_LENGTH]; // store the name of the EST, assuming it's < 100 characters char *estCharTable = new char[MAX_EST_LENGTH]; // store forward EST seq char *rc_estCharTable = new char[MAX_EST_LENGTH]; // store reverse complement EST seq est_file = fopen(EST_FILE, "r"); if (NULL == est_file) { printf("Cannot open %s\n", EST_FILE); exit(8); } estCount(est_file); // First count EST number and set global variable estNum printf("******************************************************************************\n"); for (int j = 0; j < estNum; j++) // Now let's process one EST at a time! { est_checker = 0; // reset to 1 when print alignment, to make sure each EST gets counted once checker = 1; // reset to not found estChar_count_store(est_file, estCharTable, estName); // count and store characters in EST. Used for counting how many ESTs are mapped reverse(estCharTable, rc_estCharTable); // perform and store reverse complement int *estIntTable = new int[estSeqCount]; // forward sequence for (int i = 0; i < estSeqCount; i++) estIntTable[i] = encode((int)estCharTable[i]); int *rc_estIntTable = new int[estSeqCount]; // reverse complement sequence for (int i = 0; i < estSeqCount; i++) rc_estIntTable[i] = encode((int)rc_estCharTable[i]); // Now let's map the EST!!! for (int i = 0; i < estSeqCount - HASHLENGTH; i++) //0~12,1~13,2~14... { if (checker == 1) // not found { // process forward sequence, rc = 0 hashSearch(hashFunc(estIntTable, i, HASHLENGTH), hashTable, locTable, charTable, estCharTable, count, estIntTable, i, 0); // process reverse complement sequence, rc = 1 hashSearch(hashFunc(rc_estIntTable, i, HASHLENGTH), hashTable, locTable, charTable, rc_estCharTable, count, rc_estIntTable, i, 1); } } delete [] estIntTable; delete [] rc_estIntTable; } /***************************** The below two lines are for timing ************************/ est_finishTime = clock(); est_duration = (double)(est_finishTime - est_startTime) / CLOCKS_PER_SEC; printf("******************************************************************************\n"); printf("Number of ESTs: %d\n", estNum); //estFoundNum = estNum- estFoundNum; printf("Number of mapped ESTs: %d\n", estFoundNum); printf("Number of alignments: %d\n", alignmentNum); printf("EST searching time: %3.2f seconds\n", est_duration); printf("Efficiency: %7.0f ESTs/min", 60*estNum/est_duration); //////////////////////////////////////////////////////////////////////////////////////////// fclose(est_file); // Done with the EST file delete [] charTable; // Release memory delete [] hashTable; delete [] locTable; printf("\n\nMemory released. Job done.\n\n"); return 0; } ///////////////////////////////////////////////////////////////////////////// int hashCal(int a) { /* Prime number source: http://planetmath.org/encyclopedia/GoodHashTablePrimes.html each number in the list is prime each number is slightly less than twice the size of the previous each number is as far as possible from the nearest two powers of two */ int goodPrimes[26] = { 53, 97, 193, 389, 769, 1543, 3079, 6151, 12289, 24593, 49157, 98317, 196613, 393241, 786433, 1572869, 3145739, 6291469, 12582917, 25165843, 50331653, 100663319, 201326611, 402653189, 805306457, 1610612741 }; for (int i = 0; i < 26; i++) { if (a < 0.6 * goodPrimes[i]) // Determines a good hash table size { hashSize = goodPrimes[i]; break; } } return hashSize; } ///////////////////////////////////////////////////////////////////////////// int encode(int b) { if (b == 65) {return 2;} // A -> 2 else if (b == 67) {return 3;} // C -> 3 else if (b == 71) {return 4;} // G -> 4 else if (b == 84) {return 5;} // T -> 5 else {return 1;} } // Checking collision bool check(int item, int *hashTable, int h, int checkNum) { if (checkNum < COLLISION_THRESHOLD) { probeCounter ++; // To keep track of the number of probes return hashTable[h] != EMPTY; } else probeCounter ++; return 0; } // Adding an integer using double hashing … void Insert( int value, int i, int *hashTable, int *locTable) { int item = value; int h = value % hashSize; int checkNum = 0; int h2 = value % ANOTHERPRIME + 1; // The other prime number: ANOTHERPRIME while (check(item, hashTable, h, checkNum)) { h = (h + h2) % hashSize; checkNum++; } //printf("%d [position] with item %d [value]\n", h, value); if (hashTable[h] == EMPTY) { hashTable[h] = item; locTable[h] = i; } } // Hashing function int hashFunc(int *intTable, int i, int length) { long long int value = 0; for (int j = 0; j < length; j++) value = value * 5 + intTable[i + j]; //if (i < 100) //printf("\nvalue %% hashSize = %Ld ", value % hashSize); funcEvaNum++; return ( int) value; } // Count characters int charCount(FILE *in_file) { int count = 0; // number of characters seen // character or EOF flag from input int ch; if (( ch = fgetc(in_file)) == '>') { while ( ( ch = fgetc(in_file)) != '\n' ) // skip first line header from file after '>' ; while (1) { ch = fgetc(in_file); if (ch == '\n' || ch == ' ') ; else if (ch == EOF) break; else ++count; } } return count; } // Count the Number of EST in a file int estCount(FILE *est_file) { int ch; while(1) { ch = fgetc(est_file); if (ch == '>') estNum++; else if (ch == EOF) break; } printf("\nNumber of ESTs in %s is %d\n", EST_FILE, estNum); rewind(est_file); // reset pointer position to the beginning of file return estNum; } // Count and store a EST in an array void estChar_count_store(FILE *est_file, char *estCharTable, char *estName) { int ch; int seen = 0; // seen '>' how many times int i = 0; estSeqCount = 0, nameCount = 0; while (seen != 2) // an EST is the sequence between two '>'s { if ((ch = fgetc(est_file)) == '>') { seen++; if (seen == 1) { for (int i = 0; (ch = fgetc(est_file)) != '\n'; i++) { estName[i] = ch; // store EST name nameCount++; // count EST name length } } } else if (ch == 'A' || ch == 'a' || ch == 'T' || ch == 't' || ch == 'C' || ch == 'c' || ch == 'G' || ch == 'g' || ch == 'N' || ch == 'n' || ch == '?') { estCharTable[i] = toupper(ch); // toUpper and save i++; estSeqCount++; } else if (ch == EOF) break; } fseek (est_file, -1, SEEK_CUR); // reset pointer position to before '>' printf(">"); for (int i = 0; i < nameCount; i++) printf("%c", estName[i]); printf("\nLength = %d\n", estSeqCount); /* // For debugging prints out first and last 20 characters of EST for (int i = 0; i < 15; i++) printf("%c", estCharTable[i]); printf("......"); for (int i = estSeqCount - 15; i < estSeqCount; i++) printf("%c", estCharTable[i]); */ } void reverse(char *estCharTable, char *rc_estCharTable) { for (int i = 0; i < estSeqCount; i++) rc_estCharTable[i] = complement(estCharTable[estSeqCount - i - 1]); } char complement(char a) { if (a == 'A') {return 'T';} // A -> T else if (a == 'C') {return 'G';} // C -> G else if (a == 'G') {return 'C';} // G -> C else if (a == 'T') {return 'A';} // T -> A else return a; } ////////////////////////////////////////////////////////////////////////////////////// void hashSearch(int value, int *hashTable, int *locTable, char *charTable, char *estCharTable, int count, int *estIntTable, int i, int rc) { int item = value; int h = value % hashSize; int h2 = value % ANOTHERPRIME + 1; // The other prime number: ANOTHERPRIME while (1) { if (hashTable[h] == item) { smith_waterman(charTable + locTable[h] - i, estCharTable, estSeqCount + 5, estSeqCount, locTable, h, rc); h = (h + h2) % hashSize; while(hashTable[h] != EMPTY) { if (hashTable[h] == item) { smith_waterman(charTable + locTable[h] - i, estCharTable, estSeqCount + 5, estSeqCount, locTable, h, rc); } h = (h + h2) % hashSize; } checker = 0; // if found then there is no need to do further hash search break; } else if (hashTable[h] == EMPTY) break; else h = (h + h2) % hashSize; } } //////////////////////////////////////////////////////////////////////////////////////////// void kewordsCal(int count, int *intTable, int *keyword_hashTable, int *keyword_countTable, int *keyword_locTable, int *keywordTable, int *keywordCount) { for (int i = 0; i < hashSize; i++) keyword_hashTable[i] = EMPTY; // initializing keyword_hashTable for (int i = 0; i < hashSize; i++) keyword_countTable[i] = 0; // initializing keyword_countTable printf("indexing for repeat words...\n"); for (int i = 0; i <= count - HASHLENGTH; i++) { int value = hashFunc(intTable, i, HASHLENGTH); keyword_Insert(value, i, keyword_hashTable, keyword_countTable, keyword_locTable); } printf("Calculating top repeat words...\n"); for (int i = 0; i < REPEATNUM; i++) { int max = 0, max_location = 0; for (int j = 0; j < hashSize; j++) { if (max < keyword_countTable[j]) { max = keyword_countTable[j]; max_location = j; } } keywordCount[i] = max; keywordTable[i] = keyword_hashTable[max_location]; keyword_countTable[max_location] = 0; } } ///////////////////////////////////////////////////////////////////////////// bool keyword_check(int item, int *keyword_hashTable, int h, int *keyword_countTable) { probeCounter++; if (keyword_hashTable[h] == item) keyword_countTable[h]++; return keyword_hashTable[h] != item && keyword_hashTable[h] != EMPTY; } // Adding an integer using double hashing … void keyword_Insert(int value, int i, int *keyword_hashTable, int *keyword_countTable, int *keyword_locTable) { int item = value; int h = value % hashSize; int h2 = value % ANOTHERPRIME + 1; // The other prime number: ANOTHERPRIME while (keyword_check(item, keyword_hashTable, h, keyword_countTable)) { h = (h + h2) % hashSize; } //printf("%d [position] with item %d [value]\n", h, value); if (keyword_hashTable[h] == EMPTY) { keyword_hashTable[h] = item; // insert into hash table keyword_locTable[h] = i; // keep track of the location on the genome } } ///////////////////////////////////////////////////////////////////////////// int hash_value_location(int *keyword_hashTable, int value, int count) { int i = 0; for (i = 0; i < count; i++) { if (value == keyword_hashTable[i]) break; } return i; } void repeat_cal_main(int count, int *intTable, char *charTable) { // Now let's take a minute and calculate some common key words/////////////////////////// int *keyword_hashTable = new int[hashSize]; // for inserting values int *keyword_countTable = new int[hashSize]; // to keep track of the number of collisions int *keyword_locTable = new int[hashSize]; // to keep track of the location on genome int *keywordTable = new int[REPEATNUM]; // to keep track of keywords int *keywordCount = new int[REPEATNUM]; // to keep track of number of each keyword kewordsCal(count, intTable, keyword_hashTable, keyword_countTable, keyword_locTable, keywordTable, keywordCount); // print out top repeat words printf("\nTop %d repeats:\n", REPEATNUM); for (int j = 0; j < REPEATNUM; j++) { printf("%3d ", j + 1); int hashValueLocation = hash_value_location(keyword_hashTable, keywordTable[j], hashSize); for (int i = 0; i < HASHLENGTH; i++) printf("%c", charTable[i + keyword_locTable[hashValueLocation]]); printf("... Repeat times: %d\n", keywordCount[j]); } delete [] keyword_locTable; delete [] keyword_hashTable; // release memory delete [] keyword_countTable; // release memory printf("\nRepeat calculation probe counts: %Ld\n", probeCounter); // Done with calculating keywords///////////////////////////////////////////////////////// } // The code below was adapted from //https://wiki.uni-koeln.de/biologicalphysics/index.php/Implementation_of_the_Smith-Waterman_local_alignment_algorithm void smith_waterman(char *seq_a, char *seq_b, int N_a, int N_b, int *locTable, int h, int rc) { // initialize H double H[MAX_EST_LENGTH][MAX_EST_LENGTH]; for(int i=0;i<=N_a;i++){ for(int j=0;j<=N_b;j++){ H[i][j]=0; } } double temp[4]; int I_i[MAX_EST_LENGTH][MAX_EST_LENGTH],I_j[MAX_EST_LENGTH][MAX_EST_LENGTH]; // Index matrices to remember the 'path' for backtracking // here comes the actual algorithm for(int i=1;i<=N_a;i++){ for(int j=1;j<=N_b;j++){ temp[0] = H[i-1][j-1]+similarity_score(seq_a[i-1],seq_b[j-1]); temp[1] = H[i-1][j]-delta; temp[2] = H[i][j-1]-delta; temp[3] = 0; H[i][j] = find_array_max(temp,4); switch(ind){ case 0: // score in (i,j) stems from a match/mismatch I_i[i][j] = i-1; I_j[i][j] = j-1; break; case 1: // score in (i,j) stems from a deletion in sequence A I_i[i][j] = i-1; I_j[i][j] = j; break; case 2: // score in (i,j) stems from a deletion in sequence B I_i[i][j] = i; I_j[i][j] = j-1; break; case 3: // (i,j) is the beginning of a subsequence I_i[i][j] = i; I_j[i][j] = j; break; } } } /* // Print the matrix H to the console cout<<"**********************************************"<H_max){ H_max = H[i][j]; i_max = i; j_max = j; } } } // Backtracking from H_max int current_i=i_max,current_j=j_max; int next_i=I_i[current_i][current_j]; int next_j=I_j[current_i][current_j]; int tick = 0; int gap_num = 0; double mismatch_num = 0; double match_percentage; char consensus_a[N_a+N_b+2],consensus_b[N_a+N_b+2]; while(((current_i!=next_i) || (current_j!=next_j)) && (next_j!=-1) && (next_i!=-1)) { if(next_i==current_i) { gap_num++; consensus_a[tick] = '-'; // deletion in A } else consensus_a[tick] = seq_a[current_i-1]; // match/mismatch in A if(next_j==current_j) { gap_num++; consensus_b[tick] = '-'; // deletion in B } else consensus_b[tick] = seq_b[current_j-1]; // match/mismatch in B current_i = next_i; current_j = next_j; next_i = I_i[current_i][current_j]; next_j = I_j[current_i][current_j]; tick++; //printf("tick = %d\n", tick); } mismatch_num = (tick - H_max - gap_num * (delta + 1)) / (mu + 1); // mismatch number match_percentage = 100*(tick - gap_num - mismatch_num)/estSeqCount; // percentage of matches //printf("gap number = %d ", gap_num); //printf("mismatch number = %f ", mismatch_num); if (match_percentage > MATCH_PERCENT_THRESHOLD & gap_num < GAP_NUM_THRESHOLD) { if (match_percentage != 100) // If there is mismatch, then tolower it! mismatch_tolower(consensus_a, consensus_b, tick); // to increase the readability print_alignment(match_percentage, tick, consensus_a, consensus_b, locTable, h, rc); //printf(" estSeqCount = %d", estSeqCount); //printf(" gap_num = %d", gap_num); //printf(" mismatch_num = %2.1f\n", mismatch_num); } } ///////////////////////////////////////////////////////////////////////////// double similarity_score(char a,char b){ double result; if(a==b){ result=1; } else{ result=-mu; } return result; } ///////////////////////////////////////////////////////////////////////////// double find_array_max(double array[],int length){ double max = array[0]; // start with max = first element ind = 0; for(int i = 1; i max){ max = array[i]; ind = i; } } return max; // return highest value in array } ////////////////////////////////////////////////////////////////////////////// void mismatch_tolower(char *consensus_a, char *consensus_b, int tick) { for(int i=tick-1;i>=0;i--) { if ((consensus_a[i] != consensus_b[i]) & (consensus_a[i] != '-') & (consensus_b[i] != '-')) { consensus_a[i] = tolower(consensus_a[i]); consensus_b[i] = tolower(consensus_b[i]); } } } ////////////////////////////////////////////////////////////////////////////// void print_alignment(double match_percentage, int tick, char *consensus_a, char *consensus_b, int *locTable, int h, int rc) { if (est_checker == 0) { estFoundNum++; est_checker = 1; // reset to 1 to make sure each EST gets counted once, avoid repeat counts } alignmentNum++; // keep track of number of alignments printed //*****************output******************** //printf("tick = %d////////////////////////////////////\n", tick); //cout<<"H_max = "<=0;i--) printf("%c", consensus_a[i]); printf("\n"); if (rc == 0) printf(" EST: "); else printf("*rc_EST: "); // reverse complement sequence, add "*rc_" for easy recognition for(int j=tick-1;j>=0;j--) printf("%c", consensus_b[j]); printf("\n"); printf("Mapped to %d ~ %d, ", locTable[h] + 1, locTable[h] + tick); printf("percentage of matches = %2.0f%%\n", match_percentage); } ///////////////////////////////////////////////////////////////////////////// // above four functions are auxiliary functions used by smith_waterman: // /////////////////////////////////////////////////////////////////////////////