#### all lines starting with # are comment lines ##### #I'd highly highly recommend you to continue working on project 2 before our Thursday's class. Since I've made my command list (http://cys.bios.niu.edu/yyin/teach/PBB/cmd.txt) available to you, you could do it outside of class. On Thursday, I will have to go faster. If you can finish all the command lines beforehand, it will be much easier for all of us on Thursday. Then I will focus on to explain how you can complete the tables and figures. Otherwise based on today's progress, we won't be able to finish all the command lines in our very last class. #For step 3: #To avoid any troubles, as I mentioned today, you want to follow my command one-liners in that file. If you put the concatenated file in the individual genome folder (e.g. Escherichia_coli_K_12_substr__MG1655_uid57779), you will start to have problems in all the following steps. You want to use your project2 folder as your working directory. All the command one-lines are run in your project2 folder. #For step 4: #You might see warnings like: Selenocysteine (U) at position ** replaced by X. Maybe some other msgs too. No need to worry about that. See here for some explaination: https://www.biostars.org/p/111143/ #If you don't want to see these msgs, you can put 2>err at the end of the command line. I have updated my cmds in http://cys.bios.niu.edu/yyin/teach/PBB/cmd.txt. If you have already finished this step 4, no need to redo it. #For step 5: #I added a e-value filter step [awk '$11<1e-5']. Sorry I just realized that we have to use this filter because the blastp default is 10, which is too loose. #### step 1 and 2 mkdir project2 cd project2 ls -l lftp ftp.ncbi.nih.gov #### step 3 ls -l Escherichia_coli_042_uid161985/ ls -l Escherichia_coli_042_uid161985/*faa cat Escherichia_coli_042_uid161985/*faa > 042.faa cat Escherichia_coli_536_uid58531/NC_008253.faa > 536.faa cat Escherichia_coli_K_12_substr__MG1655_uid57779/NC_000913.faa > mg1655.faa ls -l #### step 4 makeblastdb -in 042.faa -dbtype prot -parse_seqids makeblastdb -in 536.faa -dbtype prot -parse_seqids makeblastdb -in mg1655.faa -dbtype prot -parse_seqids ls -ltr time blastp -query 042.faa -db 536.faa -out 042-536.out -outfmt 6 2>err & time blastp -query 042.faa -db mg1655.faa -out 042-mg1655.out -outfmt 6 2>err & time blastp -query 536.faa -db mg1655.faa -out 536-mg1655.out -outfmt 6 2>err & time blastp -query 536.faa -db 042.faa -out 536-042.out -outfmt 6 2>err & time blastp -query mg1655.faa -db 042.faa -out mg1655-042.out -outfmt 6 2>err & time blastp -query mg1655.faa -db 536.faa -out mg1655-536.out -outfmt 6 2>err & jobs #### step 5 less 042-536.out | awk '$11<1e-5' | cut -f1 | sort -u > 042-536.out.id less 042-mg1655.out | awk '$11<1e-5' | cut -f1 | sort -u > 042-mg1655.out.id less 536-042.out | awk '$11<1e-5' | cut -f1 | sort -u > 536-042.out.id less 536-mg1655.out | awk '$11<1e-5' | cut -f1 | sort -u > 536-mg1655.out.id less mg1655-042.out | awk '$11<1e-5' | cut -f1 | sort -u > mg1655-042.out.id less mg1655-536.out | awk '$11<1e-5' | cut -f1 | sort -u > mg1655-536.out.id cat 042.faa | grep '>' | sed 's/>//' | cut -f1 -d' ' | sort -u > 042.faa.id cat 536.faa | grep '>' | sed 's/>//' | cut -f1 -d' ' | sort -u > 536.faa.id cat mg1655.faa | grep '>' | sed 's/>//' | cut -f1 -d' ' | sort -u > mg1655.faa.id # count how many proteins in each genome; these numbers go to Table 1 # check slide 1 in http://cys.bios.niu.edu/yyin/teach/PBB/project2-venndiagram.pdf wc -l *.faa.id # count how many proteins in each genome having hits in the other two genomes # for example, 042-536.out.id has all the 042 protein IDs having hits in 536 # check slide 2 in http://cys.bios.niu.edu/yyin/teach/PBB/project2-venndiagram.pdf # these numbers also go to Table 1 wc -l *.out.id # how many common IDs comm -12 042.faa.id 042-536.out.id | wc -l # how many are unique IDs in 042-536.out.id; this should be 0 comm -13 042.faa.id 042-536.out.id | wc -l # how many are unique IDs in 042.faa.id # these are number of 042 proteins that do not have hits in 536 # they are also called ORFans based on comparison of 042 vs 536 # check slide 2 in http://cys.bios.niu.edu/yyin/teach/PBB/project2-venndiagram.pdf comm -23 042.faa.id 042-536.out.id | wc -l # save all orfans IDs resulted from different comparisons comm -23 042.faa.id 042-536.out.id > 042-536.out.id.orfan comm -23 042.faa.id 042-mg1655.out.id > 042-mg1655.out.id.orfan comm -23 536.faa.id 536-mg1655.out.id > 536-mg1655.out.id.orfan comm -23 536.faa.id 536-042.out.id > 536-042.out.id.orfan comm -23 mg1655.faa.id mg1655-042.out.id > mg1655-042.out.id.orfan comm -23 mg1655.faa.id mg1655-536.out.id > mg1655-536.out.id.orfan # count orfans in the six pair-wise comparisons # check slide 3 in http://cys.bios.niu.edu/yyin/teach/PBB/project2-venndiagram.pdf wc -l *orfan # this is to find out how many 042 orfans are shared between 042-536 comparison and 042-mg1655 comparison # that is, how many 042 proteins do not have hits in both 536 and mg1655 # check slide 4 in http://cys.bios.niu.edu/yyin/teach/PBB/project2-venndiagram.pdf comm -12 042-536.out.id.orfan 042-mg1655.out.id.orfan | wc -l # the follow two cmds could be ignored comm -13 042-536.out.id.orfan 042-mg1655.out.id.orfan | wc -l comm -23 042-536.out.id.orfan 042-mg1655.out.id.orfan | wc -l # save the orfan IDs # check slide 5 in http://cys.bios.niu.edu/yyin/teach/PBB/project2-venndiagram.pdf comm -12 042-536.out.id.orfan 042-mg1655.out.id.orfan > 042.orfan comm -12 536-042.out.id.orfan 536-mg1655.out.id.orfan > 536.orfan comm -12 mg1655-042.out.id.orfan mg1655-536.out.id.orfan > mg1655.orfan #### step 6 wget -q http://cys.bios.niu.edu/orfanfinder/files/ORFanFinder.tar.gz ls -ltr tar zxf ORFanFinder.tar.gz g++ -std=c++0x ORFanFinder.cc -o ORFanFinder ./ORFanFinder /disk1/ecoli/Escherichia_coli_K_12_substr__MG1655_uid57779.out 511145 nodes ncbiBact.tax > mg1655.list & #### step 7 less mg1655.list # this is to count how many orfans of different ages, which go to Table 2, col 2 less mg1655.list | cut -f2 | sort | uniq -c # save the gi numbers of different orfans less mg1655.list | grep strict | sed 's/|/\t/g' | cut -f2 > mg1655.list.strict.gi less mg1655.list | grep species | sed 's/|/\t/g' | cut -f2 > mg1655.list.species.gi less mg1655.list | grep genus | sed 's/|/\t/g' | cut -f2 > mg1655.list.genus.gi less mg1655.list | grep family | sed 's/|/\t/g' | cut -f2 > mg1655.list.family.gi less mg1655.list | grep class | sed 's/|/\t/g' | cut -f2 > mg1655.list.class.gi less mg1655.list | grep phylum | sed 's/|/\t/g' | cut -f2 > mg1655.list.phylum.gi # this is to extract the gi number of mg1655 orfans from step 5 less mg1655.orfan | cut -f2 -d'|' | sort > mg1655.orfan.gi sort mg1655.list.strict.gi > mg1655.list.strict.gi.sort # this is how many orfans in mg1655.orfan.gi are no longer orfans in mg1655.list.strict.gi # answers question in step 7 comm -13 mg1655.list.strict.gi.sort mg1655.orfan.gi | wc -l # the following two cmds could be ignored comm -12 mg1655.list.strict.gi.sort mg1655.orfan.gi | wc -l comm -23 mg1655.list.strict.gi.sort mg1655.orfan.gi | wc -l #### step 8: the xls files will be transferred to your Windows computer as Table S1 to S6 grep -f mg1655.list.strict.gi Escherichia_coli_K_12_substr__MG1655_uid57779/NC_000913.ptt grep -f mg1655.list.strict.gi Escherichia_coli_K_12_substr__MG1655_uid57779/NC_000913.ptt > mg1655.list.strict.gi.ptt.xls grep -f mg1655.list.species.gi Escherichia_coli_K_12_substr__MG1655_uid57779/NC_000913.ptt > mg1655.list.species.gi.ptt.xls grep -f mg1655.list.genus.gi Escherichia_coli_K_12_substr__MG1655_uid57779/NC_000913.ptt > mg1655.list.genus.gi.ptt.xls grep -f mg1655.list.family.gi Escherichia_coli_K_12_substr__MG1655_uid57779/NC_000913.ptt > mg1655.list.family.gi.ptt.xls grep -f mg1655.list.class.gi Escherichia_coli_K_12_substr__MG1655_uid57779/NC_000913.ptt > mg1655.list.class.gi.ptt.xls grep -f mg1655.list.phylum.gi Escherichia_coli_K_12_substr__MG1655_uid57779/NC_000913.ptt > mg1655.list.phylum.gi.ptt.xls #### step 9 grep uncharacterized mg1655.list.species.gi.ptt.xls # the following counts go to Table 2: species orfans grep uncharacterized mg1655.list.species.gi.ptt.xls | wc -l grep phage mg1655.list.species.gi.ptt.xls | wc -l grep transposase mg1655.list.species.gi.ptt.xls | wc -l # the following counts go to Table 2: strict orfans grep uncharacterized mg1655.list.strict.gi.ptt.xls | wc -l grep phage mg1655.list.strict.gi.ptt.xls | wc -l grep transposase mg1655.list.strict.gi.ptt.xls | wc -l # the following counts go to Table 2: genus orfans grep uncharacterized mg1655.list.genus.gi.ptt.xls | wc -l grep phage mg1655.list.genus.gi.ptt.xls | wc -l grep transposase mg1655.list.genus.gi.ptt.xls | wc -l # the following counts go to Table 2: family orfans grep uncharacterized mg1655.list.family.gi.ptt.xls | wc -l grep phage mg1655.list.family.gi.ptt.xls | wc -l grep transposase mg1655.list.family.gi.ptt.xls | wc -l # the following counts go to Table 2: class orfans grep uncharacterized mg1655.list.class.gi.ptt.xls | wc -l grep phage mg1655.list.class.gi.ptt.xls | wc -l grep transposase mg1655.list.class.gi.ptt.xls | wc -l # the following counts go to Table 2: phylum orfans grep uncharacterized mg1655.list.phylum.gi.ptt.xls | wc -l grep phage mg1655.list.phylum.gi.ptt.xls | wc -l grep transposase mg1655.list.phylum.gi.ptt.xls | wc -l