/********************************************* GENOME RANDOMIZER Copyright 2006 Jan Mrazek (mrazek@uga.edu) *********************************************/ /************************************************************************/ /* THIS PROGRAM IS DISTRIBUTED "AS IS" WITH NO WARRANTY, */ /* OR SUPPORT, WITHOUT EVEN THE IMPLIED WARRANTY */ /* OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. */ /* THE AUTHOR SHALL NOT BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER */ /* LIABILITY, IN CONNECTION WITH THIS PROGRAM. */ /* */ /* YOU CAN MODIFY AND REDISTRIBUTE THIS PROGRAM UNDER THE TERMS OF */ /* THE GNU GENERAL PUBLIC LICENSE (http://www.gnu.org/licenses/gpl.txt) */ /* */ /* Please cite Mrazek, J. (2006) Analysis of distribution indicates */ /* diverse functions of simple sequence repeats in Mycoplasma genomes, */ /* Mol Biol. Evol. 23:1370-1385 if you use this program in a */ /* published work. */ /************************************************************************/ /********************************************************************************/ /* INSTALLATION INSTRUCTIONS: */ /* Compile with a command "cc genrand.c -fsigned-char -o ~/bin/genrand" */ /* or a suitable modification. gcc compiler should work fine, too. */ /* The program was tested on Redhat Linux. */ /* Run as "genrand ", where */ /* is an input sequence in GenBank format (e.g., */ /* ftp://ftp.ncbi.nih.gov/genomes/Bacteria/Escherichia_coli_K12/NC_000913.gbk), */ /* is the desired output file name, and is the seed */ /* for random number generator (using the same seed allows you to reproduce */ /* the same randomized sequence). */ /* Example: genrand NC_000913.gbk E.coli_K12 1 */ /********************************************************************************/ /******************************************************************************** History and modifications of this program: Modified on 7/14/2006 by Jan Mrazek (mrazek@uga.edu) 3/26/2007 by J.M.: fixed a problem leading to segmentation fault if stack limit is set too low. *********************************************************************************/ /* Reads a sequence and annotation in GenBank format and generates 11 random sequences using different models as follows: homogeneous ("global") models (applied to the whole genome/chromosome): rndb: simple bernoulli model (independently drawn letters with nonuniform probabilities) rndm1: first order Markov (reproducing dinucleotide frequencies) rndm3: third order Markov (reproducing tetranucleotide frequencies) rndm5: fifth order Markov (reproducing hexanucleotide frequencies) heterogeneous ("local") models (each gene/orf/exon and each intergenic region randomized separately): rndbb: Bernoulli model for both coding and noncoding rndbbp: as previous but separating different codon positions rndm1m1: first order Markov for both coding and noncoding rndm1m1p: as previous but separating different codon positions rmdm1c: Markov for intergenic, whole codons for genes rndm1c1: Markov for intergenic, codons dependent on preceding nucleotide for genes rndm3m3p: third order Markov for intergenic, position-dependent third order Markov for genes; pseudocounts are added equal to mononucleotide frequencies. For all Markov models, if a transition probability is not defined due to small sample size, the non-Markov probabilities are used instead. */ /* Note: This program is intended for use with prokaryotic genomes. It can also be applied to eukaryotic genomes with short introns and intergenic sequences. The heterogeneous models used in this program don't make much sense for genomes where some of the segments (exons, introns or intergenic regions) are very large. */ #include #include #include #include #define MAXLINE 513 #define MAXSEQLEN 12000000 // maximum length of the analyzed sequence #define MAXEXONS 50000 // maximum number of annotated exons #define MAXCDSLEN 50000 // maximum length of a single CDS #define MAXLOCI 20000 // maximum number of sequence entries in the input file ("LOCUS" line) #define MAXGENES 20000 // maximum number of annotated genes (CDS) /********************************************************************/ /* Global declarations */ /********************************************************************/ struct ExonSt { long Gene; long Start; long End; long GeneStart; long GeneEnd; char Locus[13]; }; struct LocusStr { long Start; char Locus[13]; }; static char Seq[MAXSEQLEN+2]; struct ExonSt Exon[MAXEXONS]; struct LocusStr Loc[MAXLOCI]; char SeqFiName[MAXLINE]; long LSeq,NExons,LCDS,NLoci; char NTDef[16][5]= { {0,0,0,0,0}, {1,1,0,0,0}, /* A (code 1) comprises 1 nucleotide A (code 1) */ {1,2,0,0,0}, /* C (code 2) comprises 1 nucleotide C (code 2) */ {1,3,0,0,0}, /* G (code 3) comprises 1 nucleotide G (code 3) */ {1,4,0,0,0}, /* T (code 4) comprises 1 nucleotide T (code 4) */ {2,2,3,0,0}, /* S (code 5) comprises 2 nucleotides C,G (codes 2,3) */ {2,1,4,0,0}, /* W (code 6) comprises 2 nucleotides A,T (codes 1,4) */ {2,1,3,0,0}, /* R (code 7) comprises 2 nucleotides A,G (codes 1,3) */ {2,1,2,0,0}, /* M (code 8) comprises 2 nucleotides A,C (codes 1,2) */ {2,3,4,0,0}, /* K (code 9) comprises 2 nucleotides G,T (codes 3,4) */ {2,2,4,0,0}, /* Y (code 10) comprises 2 nucleotides C,T (codes 2,4) */ {3,2,3,4,0}, /* B (code 11) comprises 3 nucleotides C,G,T (codes 2,3,4) */ {3,1,3,4,0}, /* D (code 12) comprises 3 nucleotides A,G,T (codes 1,3,4) */ {3,1,2,4,0}, /* H (code 13) comprises 3 nucleotides A,C,T (codes 1,2,4) */ {3,1,2,3,0}, /* V (code 14) comprises 3 nucleotides A,C,G (codes 1,2,3) */ {4,1,2,3,4} }; /* N (code 15) comprises 4 nucleotides A,C,G,T (codes 1,2,3,4) */ /********************************************************************/ /* NTtoI */ /********************************************************************/ char NTtoI ( NT ) char NT; { /* assigns integer numbers to nucleotides acording to the following code: A ... 1 C ... 2 G ... 3 T ... 4 S ... 5 W ... 6 R ... 7 M ... 8 K ... 9 Y ... 10 B ... 11 D ... 12 H ... 13 V ... 14 N ... 15 any other character is assigned 0. */ if ( NT=='T' || NT=='t' ) return(4); if ( NT=='G' || NT=='g' ) return(3); if ( NT=='C' || NT=='c' ) return(2); if ( NT=='A' || NT=='a' ) return(1); if ( NT=='S' || NT=='s' ) return(5); if ( NT=='W' || NT=='w' ) return(6); if ( NT=='R' || NT=='r' ) return(7); if ( NT=='M' || NT=='m' ) return(8); if ( NT=='K' || NT=='k' ) return(9); if ( NT=='Y' || NT=='y' ) return(10); if ( NT=='B' || NT=='b' ) return(11); if ( NT=='D' || NT=='d' ) return(12); if ( NT=='H' || NT=='h' ) return(13); if ( NT=='V' || NT=='v' ) return(14); if ( NT=='N' || NT=='n' ) return(15); return(0); } /********************************************************************/ /* */ /* GetCDSPos */ /* */ /********************************************************************/ int GetCDSPos(IFi,Exon,Gene) FILE *IFi; struct ExonSt *Exon; long Gene; /* fills the structure Exon with the exon start, end, and serial number of gene it belongs to. Returns the number of exons newly filled in this step. Return value means: >0 ... number of exons if o.k. -1 ... no next CDS in the file. -2 ... no next CDS untill "//" line. -3 ... CDS runs beyond the end of the sequence (end of CDS > sequence length). -7 ... With the "join" keyword, the " CDS" line should continue on the next line but does not. -9 or -99 ... Undefined symbol in " CDS" line. -97 or -98 ... Undefined symbol in " CDS" line with the "join" keyword. */ { char Line[MAXLINE]; int I,IQEnd; long IExon; if (fgets(Line,MAXLINE,IFi) == NULL) { return(-1); }; while ( Line[5]!='C' || Line[6]!='D' || Line[7]!='S' || Line[0]!=' ' || Line[1]!=' ' || Line[2]!=' ' || Line[3]!=' ' || Line[4]!=' ' ) { if (fgets(Line,MAXLINE,IFi) == NULL) { return(-1); }; if (Line[0]=='/' && Line[1]=='/') return(-2); }; /* First next line begining with " CDS" is now in Line */ if (isdigit(Line[21])) { /* The simplest case like "124..567" or "124..>566" */ Exon[0].Start=atol(Line+21); I=22; while ( Line[I]!='.' ) ++I; while ( Line[I]=='.' || Line[I]==' ' ) ++I; if ( isdigit(Line[I]) ) { Exon[0].End=atol(Line+I); } /* else if (Line[I]=='>') { Exon[0].End=atol(Line+I+1); } */ else { return(-99); }; Exon[0].Gene=Gene; return(1); } else if (Line[21]=='c' && Line[31]=='(' && Line[32]!='j') { /* The case with the "complement" keyword like "complement(124..567)" or "complement(<124..566)" */ if ( isdigit(Line[32]) ) { Exon[0].End=atol(Line+32); } /* else if ( Line[32]=='<' ) { Exon[0].End=atol(Line+33); } */ else { return(-99); }; I=33; while ( Line[I]!='.' ) ++I; while ( Line[I]=='.' || Line[I]==' ' ) ++I; if ( isdigit(Line[I]) ) { Exon[0].Start=atol(Line+I); } else { return(-99); }; Exon[0].Gene=Gene; return(1); } else if (Line[21]=='j' && Line[25]=='(') { /* The case with the "join" keyword */ I=26; IQEnd=0; IExon=0; while (IQEnd==0) { while (Line[I]==' ') ++I; if (isdigit(Line[I])) { Exon[IExon].Start=atol(Line+I); while ( Line[I]!='.' ) ++I; while ( Line[I]=='.' || Line[I]==' ' ) ++I; if ( isdigit(Line[I]) ) { Exon[IExon].Gene=Gene; Exon[IExon].End=atol(Line+I); ++IExon; } else { return(-97); }; } else if ( Line[I]=='c' && Line[I+10]=='(' ) { I=I+11; ++IExon; if ( isdigit(Line[I]) ) { Exon[IExon].End=atol(Line+I); } else { return(-97); }; ++I; while ( Line[I]!='.' ) ++I; while ( Line[I]=='.' || Line[I]==' ' ) ++I; if ( isdigit(Line[I]) ) { Exon[IExon].Gene=Gene; Exon[IExon].Start=atol(Line+I); ++IExon; } else { return(-97); }; while (Line[I]!=')') ++I; ++I; } else { return(-98); }; while ( Line[I]!=',' && Line[I]!='\0' && I<=MAXLINE ) ++I; if (Line[I]!=',') { IQEnd=1; } else { ++I; while (Line[I]==' ' && I=MAXLOCI) printf ("\n\nToo many sequence entries in the input file.\n\n"); ++NLoci; strncpy(Loc[NLoci].Locus,Locus,12); Loc[NLoci].Start=LSeq+1; Seq[LSeq]=0; if (NExons+10) { if (LSeq4) Seq[LSeq]=0; } else if (IQSeqPrint==0) { printf ("Sequence to long. Reduced to the first %li bp.\n",LSeq); IQSeqPrint=1; IQEof=-1; }; }; Z=fgetc(IFi); }; fseek(IFi,FilePos,0); --IGene; while (IQEof!=-1 && IQEof!=-2) { ++IGene; IQEof=GetCDSPos(IFi,Exon+NExons+1,IGene); if (IQEof>0 && ( (Exon[NExons].GeneEnd!=Exon[NExons+IQEof].End && Exon[NExons0].GeneStart!=Exon[NExons+1].Start && Exon[NExons].GeneStart!=Exon[NExons+IQEof].Start && Exon[NExons0].GeneEnd!=Exon[NExons+1].End) || NExons==-1 || strncmp(Exon[NExons].Locus,Locus,12)!=0 || Exon[NExons].End==Exon[NExons].Start || (Exon[NExons].End-Exon[NExons].Start)*(Exon[NExons+1].End-Exon[NExons+1].Start)<0 ) ) { if (NExons+IQEof0 && ( (Exon[NExons].GeneEnd==Exon[NExons+IQEof].End || Exon[NExons0].GeneStart==Exon[NExons+1].Start || Exon[NExons].GeneStart==Exon[NExons+IQEof].Start || Exon[NExons0].GeneEnd==Exon[NExons+1].End) && Exon[NExons+1].Start>0 && Exon[NExons+IQEof].End>0 && strcmp(Exon[NExons].Locus,Locus)==0 ) ) { printf ("\n*** A duplicate CDS ignored (%li).\n*** CDSs having the same start or end are considered duplicate.\n\n",Exon[NExons+1].Start); }; }; }; }; for (I=0;I<=NExons;++I) { LCDS+=abs(Exon[I].End-Exon[I].Start)+1; }; printf ("Sequence length: %li\n",LSeq); /* printf ("Number of genes: %li, number of exons: %li, total length of CDS: %li\n",Exon[NExons].Gene,NExons+1,LCDS); */ printf ("Total number of exons: %li, total length of CDS: %li\n",NExons-NLoci,LCDS); fclose (IFi); /* Sorting exons - only for randmodseq */ for (I=2;I<=NExons;++I) { J=I; while (Exon[J-1].Start>Exon[J].Start && J>=2) { PExon=Exon[J-1]; Exon[J-1]=Exon[J]; Exon[J]=PExon; --J; }; }; }; { /* Generating random sequence */ long GNT1[5],GNT2[5][5],GNT4[5][5][5][5],GNT6[5][5][5][5][5][5]; double FGNT0[5],FGNT1[5][5],FGNT3[5][5][5][5],FGNT5[5][5][5][5][5][5]; /* Markov transition probabilities */ long S0,S1[5],S3[5][5][5],S5[5][5][5][5][5]; long L,I,J; short I1,I2,I3,I4,I5,I6; char RSeq[MAXSEQLEN+2]; double R; char OFiName[MAXLINE]; char ZNT[6]; /* double F[5][5],FG[3][5][5]; long DNT[5][5],DNTG[3][5][5],S[5],SG[3][5],Start,End,LastEnd; long I1,I2,I3,K; */ FILE *OFi; printf ("Counting oligonucleoties ...\n"); strcpy (ZNT,"NACGT"); Seq[0]=Seq[LSeq]; S0=0; for (I1=0;I1<=4;++I1) { GNT1[I1]=0; S1[I1]=0; for (I2=0;I2<=4;++I2) { GNT2[I1][I2]=0; for (I3=0;I3<=4;++I3) { S3[I1][I2][I3]=0; for (I4=0;I4<=4;++I4) { GNT4[I1][I2][I3][I4]=0; for (I5=0;I5<=4;++I5) { S5[I1][I2][I3][I4][I5]=0; for (I6=0;I6<=4;++I6) { GNT6[I1][I2][I3][I4][I5][I6]=0; }; }; }; }; }; }; ++GNT1[Seq[1]]; ++GNT1[Seq[2]]; ++GNT1[Seq[3]]; ++GNT1[Seq[4]]; ++GNT1[Seq[5]]; ++GNT2[Seq[1]][Seq[2]]; ++GNT2[Seq[2]][Seq[3]]; ++GNT2[Seq[3]][Seq[4]]; ++GNT2[Seq[4]][Seq[5]]; ++GNT4[Seq[1]][Seq[2]][Seq[3]][Seq[4]]; ++GNT4[Seq[2]][Seq[3]][Seq[4]][Seq[5]]; for (L=6;L<=LSeq;++L) { ++GNT1[Seq[L]]; ++GNT2[Seq[L-1]][Seq[L]]; ++GNT4[Seq[L-3]][Seq[L-2]][Seq[L-1]][Seq[L]]; ++GNT6[Seq[L-5]][Seq[L-4]][Seq[L-3]][Seq[L-2]][Seq[L-1]][Seq[L]]; }; for (I1=1;I1<=4;++I1) { S0+=GNT1[I1]; for (I2=1;I2<=4;++I2) { S1[I1]+=GNT2[I1][I2]; for (I3=1;I3<=4;++I3) { for (I4=1;I4<=4;++I4) { S3[I1][I2][I3]+=GNT4[I1][I2][I3][I4]; for (I5=1;I5<=4;++I5) { for (I6=1;I6<=4;++I6) { S5[I1][I2][I3][I4][I5]+=GNT6[I1][I2][I3][I4][I5][I6]; }; }; }; }; }; }; for (I1=1;I1<=4;++I1) { FGNT0[I1]=(double)(GNT1[I1])/S0; for (I2=1;I2<=4;++I2) { FGNT1[I1][I2]=(double)(GNT2[I1][I2])/S1[I1]; for (I3=1;I3<=4;++I3) { for (I4=1;I4<=4;++I4) { FGNT3[I1][I2][I3][I4]=(double)(GNT4[I1][I2][I3][I4])/S3[I1][I2][I3]; for (I5=1;I5<=4;++I5) { for (I6=1;I6<=4;++I6) { FGNT5[I1][I2][I3][I4][I5][I6]=(double)(GNT6[I1][I2][I3][I4][I5][I6])/S5[I1][I2][I3][I4][I5]; }; }; }; }; }; }; FGNT0[2]+=FGNT0[1]; FGNT0[3]+=FGNT0[2]; FGNT0[4]+=FGNT0[3]; for (I1=1;I1<=4;++I1) { FGNT1[I1][2]+=FGNT1[I1][1]; FGNT1[I1][3]+=FGNT1[I1][2]; FGNT1[I1][4]+=FGNT1[I1][3]; for (I2=1;I2<=4;++I2) { for (I3=1;I3<=4;++I3) { FGNT3[I1][I2][I3][2]+=FGNT3[I1][I2][I3][1]; FGNT3[I1][I2][I3][3]+=FGNT3[I1][I2][I3][2]; FGNT3[I1][I2][I3][4]+=FGNT3[I1][I2][I3][3]; for (I4=1;I4<=4;++I4) { for (I5=1;I5<=4;++I5) { FGNT5[I1][I2][I3][I4][I5][2]+=FGNT5[I1][I2][I3][I4][I5][1]; FGNT5[I1][I2][I3][I4][I5][3]+=FGNT5[I1][I2][I3][I4][I5][2]; FGNT5[I1][I2][I3][I4][I5][4]+=FGNT5[I1][I2][I3][I4][I5][3]; }; }; }; }; }; /* The following solves the problem of unknown bases (Seq[i] value 0) */ for (I1=0;I1<=4;++I1) { FGNT1[0][I1]=FGNT0[I1]; }; for (I1=0;I1<=4;++I1) { for (I2=0;I2<=4;++I2) { for (I3=0;I3<=4;++I3) { FGNT3[0][I3][I2][I1]=FGNT1[I2][I1]; FGNT3[I3][0][I2][I1]=FGNT1[I2][I1]; }; }; }; for (I1=0;I1<=4;++I1) { for (I2=0;I2<=4;++I2) { for (I3=0;I3<=4;++I3) { for (I4=0;I4<=4;++I4) { for (I5=0;I5<=4;++I5) { FGNT5[0][I5][I4][I3][I2][I1]=FGNT3[I4][I3][I2][I1]; FGNT5[I5][0][I4][I3][I2][I1]=FGNT3[I4][I3][I2][I1]; }; }; }; }; }; /* Bernoulli model */ printf ("Bernoulli ...\n"); srand48(atol(argv[3])); for (L=1;L<=LSeq;++L) { R=drand48(); if (Seq[L]==0) { RSeq[L]=0; } else if (REnd) { Start=Exon[I].End; End=Exon[I].Start; }; if (LastEnd+1End) { Start=Exon[I].End; End=Exon[I].Start; }; if (LastEnd+1End) { Start=Exon[I].End; End=Exon[I].Start; }; if (LastEnd+11) ++NT2[Seq[L-1]][Seq[L]]; }; S=0; for (I1=1;I1<=4;++I1) { S+=NT[I1]; S2[I1]=0; for (I2=0;I2<=4;++I2) { S2[I1]+=NT2[I1][I2]; }; }; F[1]=(double)(NT[1])/S; F[2]=(double)(NT[1]+NT[2])/S; F[3]=(double)(NT[1]+NT[2]+NT[3])/S; F[4]=(double)(NT[1]+NT[2]+NT[3]+NT[4])/S; for (I1=1;I1<=4;++I1) { if (S2[I1]>0) { F2[I1][1]=(double)(NT2[I1][1])/S2[I1]; F2[I1][2]=(double)(NT2[I1][1]+NT2[I1][2])/S2[I1]; F2[I1][3]=(double)(NT2[I1][1]+NT2[I1][2]+NT2[I1][3])/S2[I1]; F2[I1][4]=(double)(NT2[I1][1]+NT2[I1][2]+NT2[I1][3]+NT2[I1][4])/S2[I1]; } else { F2[I1][1]=F[1]; F2[I1][2]=F[2]; F2[I1][3]=F[3]; F2[I1][4]=F[4]; }; }; for (L=LastEnd+1;L<=Start-1;++L) { R=drand48(); if (Seq[L]==0) { RSeq[L]=0; } else if (L==1 || RSeq[L-1]==0) { if (R1) ++NT2[Seq[L-1]][Seq[L]]; }; S=0; for (I1=1;I1<=4;++I1) { S+=NT[I1]; S2[I1]=0; for (I2=0;I2<=4;++I2) { S2[I1]+=NT2[I1][I2]; }; }; F[1]=(double)(NT[1])/S; F[2]=(double)(NT[1]+NT[2])/S; F[3]=(double)(NT[1]+NT[2]+NT[3])/S; F[4]=(double)(NT[1]+NT[2]+NT[3]+NT[4])/S; for (I1=1;I1<=4;++I1) { if (S2[I1]>0) { F2[I1][1]=(double)(NT2[I1][1])/S2[I1]; F2[I1][2]=(double)(NT2[I1][1]+NT2[I1][2])/S2[I1]; F2[I1][3]=(double)(NT2[I1][1]+NT2[I1][2]+NT2[I1][3])/S2[I1]; F2[I1][4]=(double)(NT2[I1][1]+NT2[I1][2]+NT2[I1][3]+NT2[I1][4])/S2[I1]; } else { F2[I1][1]=F[1]; F2[I1][2]=F[2]; F2[I1][3]=F[3]; F2[I1][4]=F[4]; }; }; for (L=Start;L<=End;++L) { R=drand48(); if (Seq[L]==0) { RSeq[L]=0; } else if (L==1 || RSeq[L-1]==0) { if (R1) ++NT2[Seq[L-1]][Seq[L]]; }; S=0; for (I1=1;I1<=4;++I1) { S+=NT[I1]; S2[I1]=0; for (I2=0;I2<=4;++I2) { S2[I1]+=NT2[I1][I2]; }; }; F[1]=(double)(NT[1])/S; F[2]=(double)(NT[1]+NT[2])/S; F[3]=(double)(NT[1]+NT[2]+NT[3])/S; F[4]=(double)(NT[1]+NT[2]+NT[3]+NT[4])/S; for (I1=1;I1<=4;++I1) { if (S2[I1]>0) { F2[I1][1]=(double)(NT2[I1][1])/S2[I1]; F2[I1][2]=(double)(NT2[I1][1]+NT2[I1][2])/S2[I1]; F2[I1][3]=(double)(NT2[I1][1]+NT2[I1][2]+NT2[I1][3])/S2[I1]; F2[I1][4]=(double)(NT2[I1][1]+NT2[I1][2]+NT2[I1][3]+NT2[I1][4])/S2[I1]; } else { F2[I1][1]=F[1]; F2[I1][2]=F[2]; F2[I1][3]=F[3]; F2[I1][4]=F[4]; }; }; for (L=LastEnd+1;L<=Start-1;++L) { R=drand48(); if (Seq[L]==0) { RSeq[L]=0; } else if (L==1 || RSeq[L-1]==0) { if (REnd) { Start=Exon[I].End; End=Exon[I].Start; }; if (LastEnd+11) ++NT2[Seq[L-1]][Seq[L]]; }; S=0; for (I1=1;I1<=4;++I1) { S+=NT[I1]; S2[I1]=0; for (I2=0;I2<=4;++I2) { S2[I1]+=NT2[I1][I2]; }; }; F[1]=(double)(NT[1])/S; F[2]=(double)(NT[1]+NT[2])/S; F[3]=(double)(NT[1]+NT[2]+NT[3])/S; F[4]=(double)(NT[1]+NT[2]+NT[3]+NT[4])/S; for (I1=1;I1<=4;++I1) { if (S2[I1]>0) { F2[I1][1]=(double)(NT2[I1][1])/S2[I1]; F2[I1][2]=(double)(NT2[I1][1]+NT2[I1][2])/S2[I1]; F2[I1][3]=(double)(NT2[I1][1]+NT2[I1][2]+NT2[I1][3])/S2[I1]; F2[I1][4]=(double)(NT2[I1][1]+NT2[I1][2]+NT2[I1][3]+NT2[I1][4])/S2[I1]; } else { F2[I1][1]=F[1]; F2[I1][2]=F[2]; F2[I1][3]=F[3]; F2[I1][4]=F[4]; }; }; for (L=LastEnd+1;L<=Start-1;++L) { R=drand48(); if (Seq[L]==0) { RSeq[L]=0; } else if (L==1 || RSeq[L-1]==0) { if (R1) ++CNT2[I2][Seq[L+I2-1]][Seq[L+I2]]; }; }; for (I2=0;I2<=2;++I2) { CS[I2]=0; for (I1=1;I1<=4;++I1) { CS[I2]+=CNT[I2][I1]; }; CF[I2][1]=(double)(CNT[I2][1])/CS[I2]; CF[I2][2]=(double)(CNT[I2][1]+CNT[I2][2])/CS[I2]; CF[I2][3]=(double)(CNT[I2][1]+CNT[I2][2]+CNT[I2][3])/CS[I2]; CF[I2][4]=(double)(CNT[I2][1]+CNT[I2][2]+CNT[I2][3]+CNT[I2][4])/CS[I2]; }; for (I2=0;I2<=2;++I2) { for (I1=1;I1<=4;++I1) { CS2[I2][I1]=0; for (I3=1;I3<=4;++I3) { CS2[I2][I1]+=CNT2[I2][I1][I3]; }; if (CS2[I2][I1]>0) { CF2[I2][I1][1]=(double)(CNT2[I2][I1][1])/CS2[I2][I1]; CF2[I2][I1][2]=(double)(CNT2[I2][I1][1]+CNT2[I2][I1][2])/CS2[I2][I1]; CF2[I2][I1][3]=(double)(CNT2[I2][I1][1]+CNT2[I2][I1][2]+CNT2[I2][I1][3])/CS2[I2][I1]; CF2[I2][I1][4]=(double)(CNT2[I2][I1][1]+CNT2[I2][I1][2]+CNT2[I2][I1][3]+CNT2[I2][I1][4])/CS2[I2][I1]; } else { CF2[I2][I1][1]=CF[I2][1]; CF2[I2][I1][2]=CF[I2][2]; CF2[I2][I1][3]=CF[I2][3]; CF2[I2][I1][4]=CF[I2][4]; }; }; }; for (L=Start;L<=End;L+=3) { for (I2=0;I2<=2;++I2) { R=drand48(); if (Seq[L+I2]==0) { RSeq[L+I2]=0; } else if (L+I2==1 || RSeq[L+I2-1]==0) { if (R1) ++NT2[Seq[L-1]][Seq[L]]; }; S=0; for (I1=1;I1<=4;++I1) { S+=NT[I1]; S2[I1]=0; for (I2=0;I2<=4;++I2) { S2[I1]+=NT2[I1][I2]; }; }; F[1]=(double)(NT[1])/S; F[2]=(double)(NT[1]+NT[2])/S; F[3]=(double)(NT[1]+NT[2]+NT[3])/S; F[4]=(double)(NT[1]+NT[2]+NT[3]+NT[4])/S; for (I1=1;I1<=4;++I1) { if (S2[I1]>0) { F2[I1][1]=(double)(NT2[I1][1])/S2[I1]; F2[I1][2]=(double)(NT2[I1][1]+NT2[I1][2])/S2[I1]; F2[I1][3]=(double)(NT2[I1][1]+NT2[I1][2]+NT2[I1][3])/S2[I1]; F2[I1][4]=(double)(NT2[I1][1]+NT2[I1][2]+NT2[I1][3]+NT2[I1][4])/S2[I1]; } else { F2[I1][1]=F[1]; F2[I1][2]=F[2]; F2[I1][3]=F[3]; F2[I1][4]=F[4]; }; }; for (L=LastEnd+1;L<=Start-1;++L) { R=drand48(); if (Seq[L]==0) { RSeq[L]=0; } else if (L==1 || RSeq[L-1]==0) { if (REnd) { Start=Exon[I].End; End=Exon[I].Start; }; if (LastEnd+11) ++NT2[Seq[L-1]][Seq[L]]; }; S=0; for (I1=1;I1<=4;++I1) { S+=NT[I1]; S2[I1]=0; for (I2=0;I2<=4;++I2) { S2[I1]+=NT2[I1][I2]; }; }; F[1]=(double)(NT[1])/S; F[2]=(double)(NT[1]+NT[2])/S; F[3]=(double)(NT[1]+NT[2]+NT[3])/S; F[4]=(double)(NT[1]+NT[2]+NT[3]+NT[4])/S; for (I1=1;I1<=4;++I1) { if (S2[I1]>0) { F2[I1][1]=(double)(NT2[I1][1])/S2[I1]; F2[I1][2]=(double)(NT2[I1][1]+NT2[I1][2])/S2[I1]; F2[I1][3]=(double)(NT2[I1][1]+NT2[I1][2]+NT2[I1][3])/S2[I1]; F2[I1][4]=(double)(NT2[I1][1]+NT2[I1][2]+NT2[I1][3]+NT2[I1][4])/S2[I1]; } else { F2[I1][1]=F[1]; F2[I1][2]=F[2]; F2[I1][3]=F[3]; F2[I1][4]=F[4]; }; }; for (L=LastEnd+1;L<=Start-1;++L) { R=drand48(); if (Seq[L]==0) { RSeq[L]=0; } else if (L==1 || RSeq[L-1]==0) { if (R=FC[I1][I2][I3]) { ++I3; if (I3>4) { I3=1; ++I2; if (I2>4) { I2=1; ++I1; }; }; }; RSeq[L]=I1; RSeq[L+1]=I2; RSeq[L+2]=I3; }; }; LastEnd=End; }; if (LastEnd+1<=LSeq) { for (I1=0;I1<=4;++I1) { NT[I1]=0; for (I2=0;I2<=4;++I2) { NT2[I1][I2]=0; }; }; for (L=LastEnd+1;L<=LSeq;++L) { ++NT[Seq[L]]; if (L>1) ++NT2[Seq[L-1]][Seq[L]]; }; S=0; for (I1=1;I1<=4;++I1) { S+=NT[I1]; S2[I1]=0; for (I2=0;I2<=4;++I2) { S2[I1]+=NT2[I1][I2]; }; }; F[1]=(double)(NT[1])/S; F[2]=(double)(NT[1]+NT[2])/S; F[3]=(double)(NT[1]+NT[2]+NT[3])/S; F[4]=(double)(NT[1]+NT[2]+NT[3]+NT[4])/S; for (I1=1;I1<=4;++I1) { if (S2[I1]>0) { F2[I1][1]=(double)(NT2[I1][1])/S2[I1]; F2[I1][2]=(double)(NT2[I1][1]+NT2[I1][2])/S2[I1]; F2[I1][3]=(double)(NT2[I1][1]+NT2[I1][2]+NT2[I1][3])/S2[I1]; F2[I1][4]=(double)(NT2[I1][1]+NT2[I1][2]+NT2[I1][3]+NT2[I1][4])/S2[I1]; } else { F2[I1][1]=F[1]; F2[I1][2]=F[2]; F2[I1][3]=F[3]; F2[I1][4]=F[4]; }; }; for (L=LastEnd+1;L<=Start-1;++L) { R=drand48(); if (Seq[L]==0) { RSeq[L]=0; } else if (L==1 || RSeq[L-1]==0) { if (REnd) { Start=Exon[I].End; End=Exon[I].Start; }; if (LastEnd+11) ++NT2[Seq[L-1]][Seq[L]]; }; S=0; for (I1=1;I1<=4;++I1) { S+=NT[I1]; S2[I1]=0; for (I2=0;I2<=4;++I2) { S2[I1]+=NT2[I1][I2]; }; }; F[1]=(double)(NT[1])/S; F[2]=(double)(NT[1]+NT[2])/S; F[3]=(double)(NT[1]+NT[2]+NT[3])/S; F[4]=(double)(NT[1]+NT[2]+NT[3]+NT[4])/S; for (I1=1;I1<=4;++I1) { if (S2[I1]>0) { F2[I1][1]=(double)(NT2[I1][1])/S2[I1]; F2[I1][2]=(double)(NT2[I1][1]+NT2[I1][2])/S2[I1]; F2[I1][3]=(double)(NT2[I1][1]+NT2[I1][2]+NT2[I1][3])/S2[I1]; F2[I1][4]=(double)(NT2[I1][1]+NT2[I1][2]+NT2[I1][3]+NT2[I1][4])/S2[I1]; } else { F2[I1][1]=F[1]; F2[I1][2]=F[2]; F2[I1][3]=F[3]; F2[I1][4]=F[4]; }; }; for (L=LastEnd+1;L<=Start-1;++L) { R=drand48(); if (Seq[L]==0) { RSeq[L]=0; } else if (L==1 || RSeq[L-1]==0) { if (R1) { ++CM[Seq[L-1]][Seq[L]][Seq[L+1]][Seq[L+2]]; }; }; S=0; for (I1=1;I1<=4;++I1) { for (I2=1;I2<=4;++I2) { for (I3=1;I3<=4;++I3) { S+=C[I1][I2][I3]; }; }; }; for (I1=1;I1<=4;++I1) { S2[I1]=0; for (I2=1;I2<=4;++I2) { for (I3=1;I3<=4;++I3) { for (I4=1;I4<=4;++I4) { S2[I1]+=CM[I1][I2][I3][I4]; }; }; }; }; R=0.0; for (I1=1;I1<=4;++I1) { for (I2=1;I2<=4;++I2) { for (I3=1;I3<=4;++I3) { R+=C[I1][I2][I3]; FC[I1][I2][I3]=R/S; }; }; }; FC[4][4][4]=1.0; for (I1=1;I1<=4;++I1) { if (S2[I1]>0) { R=0.0; for (I2=1;I2<=4;++I2) { for (I3=1;I3<=4;++I3) { for (I4=1;I4<=4;++I4) { R+=CM[I1][I2][I3][I4]; FCM[I1][I2][I3][I4]=R/S2[I1]; }; }; }; FCM[I1][4][4][4]=1.0; } else { for (I2=1;I2<=4;++I2) { for (I3=1;I3<=4;++I3) { for (I4=1;I4<=4;++I4) { FCM[I1][I2][I3][I4]=FC[I2][I3][I4]; }; }; }; }; }; for (L=Start;L<=End;L+=3) { R=drand48(); if (Seq[L]==0 || Seq[L+1]==0 || Seq[L+2]==0) { RSeq[L]=Seq[L]; RSeq[L+1]=Seq[L+1]; RSeq[L+2]=Seq[L+2]; } else { if (L==1 || RSeq[L-1]==0) { I1=1; I2=1; I3=1; while (R>=FC[I1][I2][I3]) { ++I3; if (I3>4) { I3=1; ++I2; if (I2>4) { I2=1; ++I1; }; }; }; RSeq[L]=I1; RSeq[L+1]=I2; RSeq[L+2]=I3; } else { I4=RSeq[L-1]; I1=1; I2=1; I3=1; while (R>=FCM[I4][I1][I2][I3]) { ++I3; if (I3>4) { I3=1; ++I2; if (I2>4) { I2=1; ++I1; }; }; }; RSeq[L]=I1; RSeq[L+1]=I2; RSeq[L+2]=I3; }; }; }; LastEnd=End; }; if (LastEnd+1<=LSeq) { for (I1=0;I1<=4;++I1) { NT[I1]=0; for (I2=0;I2<=4;++I2) { NT2[I1][I2]=0; }; }; for (L=LastEnd+1;L<=LSeq;++L) { ++NT[Seq[L]]; if (L>1) ++NT2[Seq[L-1]][Seq[L]]; }; S=0; for (I1=1;I1<=4;++I1) { S+=NT[I1]; S2[I1]=0; for (I2=0;I2<=4;++I2) { S2[I1]+=NT2[I1][I2]; }; }; F[1]=(double)(NT[1])/S; F[2]=(double)(NT[1]+NT[2])/S; F[3]=(double)(NT[1]+NT[2]+NT[3])/S; F[4]=(double)(NT[1]+NT[2]+NT[3]+NT[4])/S; for (I1=1;I1<=4;++I1) { if (S2[I1]>0) { F2[I1][1]=(double)(NT2[I1][1])/S2[I1]; F2[I1][2]=(double)(NT2[I1][1]+NT2[I1][2])/S2[I1]; F2[I1][3]=(double)(NT2[I1][1]+NT2[I1][2]+NT2[I1][3])/S2[I1]; F2[I1][4]=(double)(NT2[I1][1]+NT2[I1][2]+NT2[I1][3]+NT2[I1][4])/S2[I1]; } else { F2[I1][1]=F[1]; F2[I1][2]=F[2]; F2[I1][3]=F[3]; F2[I1][4]=F[4]; }; }; for (L=LastEnd+1;L<=Start-1;++L) { R=drand48(); if (Seq[L]==0) { RSeq[L]=0; } else if (L==1 || RSeq[L-1]==0) { if (REnd) { Start=Exon[I].End; End=Exon[I].Start; }; if (LastEnd+11) ++NT2[Seq[L-1]][Seq[L]]; if (L>3) ++NT4[Seq[L-3]][Seq[L-2]][Seq[L-1]][Seq[L]]; }; S=0; for (I1=1;I1<=4;++I1) { S+=NT[I1]; S2[I1]=0; for (I2=0;I2<=4;++I2) { S2[I1]+=NT2[I1][I2]; for (I3=0;I3<=4;++I3) { S4[I1][I2][I3]=0; for (I4=0;I4<=4;++I4) { S4[I1][I2][I3]+=NT4[I1][I2][I3][I4]; }; }; }; }; F[1]=(double)(NT[1])/S; F[2]=(double)(NT[1]+NT[2])/S; F[3]=(double)(NT[1]+NT[2]+NT[3])/S; F[4]=(double)(NT[1]+NT[2]+NT[3]+NT[4])/S; for (I1=1;I1<=4;++I1) { if (S2[I1]>0) { F2[I1][1]=(double)(NT2[I1][1])/S2[I1]; F2[I1][2]=(double)(NT2[I1][1]+NT2[I1][2])/S2[I1]; F2[I1][3]=(double)(NT2[I1][1]+NT2[I1][2]+NT2[I1][3])/S2[I1]; F2[I1][4]=(double)(NT2[I1][1]+NT2[I1][2]+NT2[I1][3]+NT2[I1][4])/S2[I1]; } else { F2[I1][1]=F[1]; F2[I1][2]=F[2]; F2[I1][3]=F[3]; F2[I1][4]=F[4]; }; }; for (I1=1;I1<=4;++I1) { for (I2=0;I2<=4;++I2) { for (I3=0;I3<=4;++I3) { if (S4[I1][I2][I3]>0) { ++S4[I1][I2][I3]; /* adding pseudocounts */ F4[I1][I2][I3][1]=((double)(NT4[I1][I2][I3][1])+F[1])/S4[I1][I2][I3]; F4[I1][I2][I3][2]=((double)(NT4[I1][I2][I3][1]+NT4[I1][I2][I3][2])+F[2])/S4[I1][I2][I3]; F4[I1][I2][I3][3]=((double)(NT4[I1][I2][I3][1]+NT4[I1][I2][I3][2]+NT4[I1][I2][I3][3])+F[3])/S4[I1][I2][I3]; F4[I1][I2][I3][4]=((double)(NT4[I1][I2][I3][1]+NT4[I1][I2][I3][2]+NT4[I1][I2][I3][3]+NT4[I1][I2][I3][4])+F[4])/S4[I1][I2][I3]; } else { F4[I1][I2][I3][1]=F2[I3][1]; F4[I1][I2][I3][2]=F2[I3][2]; F4[I1][I2][I3][3]=F2[I3][3]; F4[I1][I2][I3][4]=F2[I3][4]; }; }; }; }; for (L=LastEnd+1;L<=Start-1;++L) { R=drand48(); if (Seq[L]==0) { RSeq[L]=0; } else if (L==1 || RSeq[L-1]==0) { if (R1) ++CNT2[I2][Seq[L+I2-1]][Seq[L+I2]]; if (L>3) ++CNT4[I2][Seq[L+I2-3]][Seq[L+I2-2]][Seq[L+I2-1]][Seq[L+I2]]; }; }; for (I2=0;I2<=2;++I2) { CS[I2]=0; for (I1=1;I1<=4;++I1) { CS[I2]+=CNT[I2][I1]; }; CF[I2][1]=(double)(CNT[I2][1])/CS[I2]; CF[I2][2]=(double)(CNT[I2][1]+CNT[I2][2])/CS[I2]; CF[I2][3]=(double)(CNT[I2][1]+CNT[I2][2]+CNT[I2][3])/CS[I2]; CF[I2][4]=(double)(CNT[I2][1]+CNT[I2][2]+CNT[I2][3]+CNT[I2][4])/CS[I2]; }; for (I2=0;I2<=2;++I2) { for (I1=1;I1<=4;++I1) { CS2[I2][I1]=0; for (I3=1;I3<=4;++I3) { CS2[I2][I1]+=CNT2[I2][I1][I3]; }; if (CS2[I2][I1]>0) { CF2[I2][I1][1]=(double)(CNT2[I2][I1][1])/CS2[I2][I1]; CF2[I2][I1][2]=(double)(CNT2[I2][I1][1]+CNT2[I2][I1][2])/CS2[I2][I1]; CF2[I2][I1][3]=(double)(CNT2[I2][I1][1]+CNT2[I2][I1][2]+CNT2[I2][I1][3])/CS2[I2][I1]; CF2[I2][I1][4]=(double)(CNT2[I2][I1][1]+CNT2[I2][I1][2]+CNT2[I2][I1][3]+CNT2[I2][I1][4])/CS2[I2][I1]; } else { CF2[I2][I1][1]=CF[I2][1]; CF2[I2][I1][2]=CF[I2][2]; CF2[I2][I1][3]=CF[I2][3]; CF2[I2][I1][4]=CF[I2][4]; }; }; }; for (I2=0;I2<=2;++I2) { for (I1=1;I1<=4;++I1) { for (I3=1;I3<=4;++I3) { for (I4=1;I4<=4;++I4) { CS4[I2][I1][I3][I4]=0; for (I5=1;I5<=4;++I5) { CS4[I2][I1][I3][I4]+=CNT4[I2][I1][I3][I4][I5]; }; if (CS4[I2][I1][I3][I4]>0) { ++CS4[I2][I1][I3][I4]; /* adding pseudocounts */ CF4[I2][I1][I3][I4][1]=((double)(CNT4[I2][I1][I3][I4][1])+CF[I2][1])/CS4[I2][I1][I3][I4]; CF4[I2][I1][I3][I4][2]=((double)(CNT4[I2][I1][I3][I4][1]+CNT4[I2][I1][I3][I4][2])+CF[I2][2])/CS4[I2][I1][I3][I4]; CF4[I2][I1][I3][I4][3]=((double)(CNT4[I2][I1][I3][I4][1]+CNT4[I2][I1][I3][I4][2]+CNT4[I2][I1][I3][I4][3])+CF[I2][3])/CS4[I2][I1][I3][I4]; CF4[I2][I1][I3][I4][4]=((double)(CNT4[I2][I1][I3][I4][1]+CNT4[I2][I1][I3][I4][2]+CNT4[I2][I1][I3][I4][3]+CNT4[I2][I1][I3][I4][4])+CF[I2][4])/CS4[I2][I1][I3][I4]; } else { CF4[I2][I1][I3][I4][1]=CF2[I2][I4][1]; CF4[I2][I1][I3][I4][2]=CF2[I2][I4][2]; CF4[I2][I1][I3][I4][3]=CF2[I2][I4][3]; CF4[I2][I1][I3][I4][4]=CF2[I2][I4][4]; }; }; }; }; }; for (L=Start;L<=End;L+=3) { for (I2=0;I2<=2;++I2) { R=drand48(); if (Seq[L+I2]==0) { RSeq[L+I2]=0; } else if (L+I2==1 || RSeq[L+I2-1]==0) { if (R1) ++NT2[Seq[L-1]][Seq[L]]; if (L>3) ++NT4[Seq[L-3]][Seq[L-2]][Seq[L-1]][Seq[L]]; }; S=0; for (I1=1;I1<=4;++I1) { S+=NT[I1]; S2[I1]=0; for (I2=0;I2<=4;++I2) { S2[I1]+=NT2[I1][I2]; for (I3=0;I3<=4;++I3) { S4[I1][I2][I3]=0; for (I4=0;I4<=4;++I4) { S4[I1][I2][I3]+=NT4[I1][I2][I3][I4]; }; }; }; }; F[1]=(double)(NT[1])/S; F[2]=(double)(NT[1]+NT[2])/S; F[3]=(double)(NT[1]+NT[2]+NT[3])/S; F[4]=(double)(NT[1]+NT[2]+NT[3]+NT[4])/S; for (I1=1;I1<=4;++I1) { if (S2[I1]>0) { F2[I1][1]=(double)(NT2[I1][1])/S2[I1]; F2[I1][2]=(double)(NT2[I1][1]+NT2[I1][2])/S2[I1]; F2[I1][3]=(double)(NT2[I1][1]+NT2[I1][2]+NT2[I1][3])/S2[I1]; F2[I1][4]=(double)(NT2[I1][1]+NT2[I1][2]+NT2[I1][3]+NT2[I1][4])/S2[I1]; } else { F2[I1][1]=F[1]; F2[I1][2]=F[2]; F2[I1][3]=F[3]; F2[I1][4]=F[4]; }; }; for (I1=1;I1<=4;++I1) { for (I2=0;I2<=4;++I2) { for (I3=0;I3<=4;++I3) { if (S4[I1][I2][I3]>0) { ++S4[I1][I2][I3]; /* adding pseudocounts */ F4[I1][I2][I3][1]=((double)(NT4[I1][I2][I3][1])+F[1])/S4[I1][I2][I3]; F4[I1][I2][I3][2]=((double)(NT4[I1][I2][I3][1]+NT4[I1][I2][I3][2])+F[2])/S4[I1][I2][I3]; F4[I1][I2][I3][3]=((double)(NT4[I1][I2][I3][1]+NT4[I1][I2][I3][2]+NT4[I1][I2][I3][3])+F[3])/S4[I1][I2][I3]; F4[I1][I2][I3][4]=((double)(NT4[I1][I2][I3][1]+NT4[I1][I2][I3][2]+NT4[I1][I2][I3][3]+NT4[I1][I2][I3][4])+F[4])/S4[I1][I2][I3]; } else { F4[I1][I2][I3][1]=F2[I3][1]; F4[I1][I2][I3][2]=F2[I3][2]; F4[I1][I2][I3][3]=F2[I3][3]; F4[I1][I2][I3][4]=F2[I3][4]; }; }; }; }; for (L=LastEnd+1;L<=Start-1;++L) { R=drand48(); if (Seq[L]==0) { RSeq[L]=0; } else if (L==1 || RSeq[L-1]==0) { if (R