#### all lines starting with # are comment lines ##### #For step 3: #To avoid any troubles, as I mentioned today, you want to follow my command one-liners in that file. If you put the concatenated file in the individual genome folder (e.g. Escherichia_coli_K_12_substr__MG1655_uid57779), you will start to have problems in all the following steps. You want to use your project2 folder as your working directory. All the command one-lines are run in your project2 folder. #For step 4: #You might see warnings like: Selenocysteine (U) at position ** replaced by X. Maybe some other msgs too. No need to worry about that. See here for some explaination: https://www.biostars.org/p/111143/ #If you don't want to see these msgs, you can put 2>err at the end of the command line. I have updated my cmds in http://cys.bios.niu.edu/yyin/teach/PBB/cmd.txt. If you have already finished this step 4, no need to redo it. #For step 5: #I added a e-value filter step [awk '$11<1e-5']. Sorry I just realized that we have to use this filter because the blastp default is 10, which is too loose. #### step 1 and 2 mkdir project2 cd project2 ls -l lftp ftp.ncbi.nih.gov # NOTE THAT NCBI MOVED Bacteria folder to ftp.ncbi.nih.gov:/genomes/archive/old_refseq/Bacteria/ on 12/03/2015 #### step 3 ls -l Escherichia_coli_042_uid161985/ ls -l Escherichia_coli_042_uid161985/*faa cat Escherichia_coli_042_uid161985/*faa > 042.faa cat Escherichia_coli_536_uid58531/NC_008253.faa > 536.faa cat Escherichia_coli_K_12_substr__MG1655_uid57779/NC_000913.faa > mg1655.faa ls -l #### step 4 makeblastdb -in 042.faa -dbtype prot -parse_seqids makeblastdb -in 536.faa -dbtype prot -parse_seqids makeblastdb -in mg1655.faa -dbtype prot -parse_seqids ls -ltr time blastp -query 042.faa -db 536.faa -out 042-536.out -outfmt 6 2>err & time blastp -query 042.faa -db mg1655.faa -out 042-mg1655.out -outfmt 6 2>err & time blastp -query 536.faa -db mg1655.faa -out 536-mg1655.out -outfmt 6 2>err & time blastp -query 536.faa -db 042.faa -out 536-042.out -outfmt 6 2>err & time blastp -query mg1655.faa -db 042.faa -out mg1655-042.out -outfmt 6 2>err & time blastp -query mg1655.faa -db 536.faa -out mg1655-536.out -outfmt 6 2>err & jobs #### step 5 less 042-536.out | awk '$11<1e-5' | cut -f1 | sort -u > 042-536.out.id less 042-mg1655.out | awk '$11<1e-5' | cut -f1 | sort -u > 042-mg1655.out.id less 536-042.out | awk '$11<1e-5' | cut -f1 | sort -u > 536-042.out.id less 536-mg1655.out | awk '$11<1e-5' | cut -f1 | sort -u > 536-mg1655.out.id less mg1655-042.out | awk '$11<1e-5' | cut -f1 | sort -u > mg1655-042.out.id less mg1655-536.out | awk '$11<1e-5' | cut -f1 | sort -u > mg1655-536.out.id cat 042.faa | grep '>' | sed 's/>//' | cut -f1 -d' ' | sort -u > 042.faa.id cat 536.faa | grep '>' | sed 's/>//' | cut -f1 -d' ' | sort -u > 536.faa.id cat mg1655.faa | grep '>' | sed 's/>//' | cut -f1 -d' ' | sort -u > mg1655.faa.id # count how many proteins in each genome; these numbers go to Table 1 # check slide 1 in http://cys.bios.niu.edu/yyin/teach/PBB/project2-venndiagram.pdf wc -l *.faa.id # count how many proteins in each genome having hits in the other two genomes # for example, 042-536.out.id has all the 042 protein IDs having hits in 536 # check slide 2 in http://cys.bios.niu.edu/yyin/teach/PBB/project2-venndiagram.pdf # these numbers also go to Table 1 wc -l *.out.id # how many common IDs comm -12 042.faa.id 042-536.out.id | wc -l # how many are unique IDs in 042-536.out.id; this should be 0 comm -13 042.faa.id 042-536.out.id | wc -l # how many are unique IDs in 042.faa.id # these are number of 042 proteins that do not have hits in 536 # they are also called ORFans based on comparison of 042 vs 536 # check slide 2 in http://cys.bios.niu.edu/yyin/teach/PBB/project2-venndiagram.pdf comm -23 042.faa.id 042-536.out.id | wc -l # save all orfans IDs resulted from different comparisons comm -23 042.faa.id 042-536.out.id > 042-536.out.id.orfan comm -23 042.faa.id 042-mg1655.out.id > 042-mg1655.out.id.orfan comm -23 536.faa.id 536-mg1655.out.id > 536-mg1655.out.id.orfan comm -23 536.faa.id 536-042.out.id > 536-042.out.id.orfan comm -23 mg1655.faa.id mg1655-042.out.id > mg1655-042.out.id.orfan comm -23 mg1655.faa.id mg1655-536.out.id > mg1655-536.out.id.orfan # count orfans in the six pair-wise comparisons # check slide 3 in http://cys.bios.niu.edu/yyin/teach/PBB/project2-venndiagram.pdf wc -l *orfan # this is to find out how many 042 orfans are shared between 042-536 comparison and 042-mg1655 comparison # that is, how many 042 proteins do not have hits in both 536 and mg1655 # check slide 4 in http://cys.bios.niu.edu/yyin/teach/PBB/project2-venndiagram.pdf comm -12 042-536.out.id.orfan 042-mg1655.out.id.orfan | wc -l # the following two cmds could be ignored comm -13 042-536.out.id.orfan 042-mg1655.out.id.orfan | wc -l comm -23 042-536.out.id.orfan 042-mg1655.out.id.orfan | wc -l # save the orfan IDs # check slide 5 in http://cys.bios.niu.edu/yyin/teach/PBB/project2-venndiagram.pdf comm -12 042-536.out.id.orfan 042-mg1655.out.id.orfan > 042.orfan comm -12 536-042.out.id.orfan 536-mg1655.out.id.orfan > 536.orfan comm -12 mg1655-042.out.id.orfan mg1655-536.out.id.orfan > mg1655.orfan #### step 6 wget ftp://cys.bios.niu.edu/orfanfinder/ORFanFinder.tar.gz ls -ltr tar zxf ORFanFinder.tar.gz # you must read README and INSTALL files to understand what the following cmds are doing ./configure --prefix=/disk2/xiaohan/ORFanFinder-1.1.0 make make install make db cat mg1655.faa | grep '>' | sed 's/>//' | cut -f1 -d' ' > mg1655.faa.fullid ./ORFanFinder -query /disk1/ecoli/ecoliNRblast.bl -id mg1655.faa.fullid -tax 511145 -nodes $HOME/tools/ORFanFinder-1.1.0/inputData/nodes -db $HOME/tools/ORFanFinder-1.1.0/databases/nr.hdb -out mg1655.list & #### step 7 less mg1655.list # this is to count how many orfans of different ages, which go to Table 2, col 2 less mg1655.list | cut -f2 | sort | uniq -c # save the gi numbers of different orfans less mg1655.list | grep strict | sed 's/|/\t/g' | cut -f2 > mg1655.list.strict.gi less mg1655.list | grep species | sed 's/|/\t/g' | cut -f2 > mg1655.list.species.gi less mg1655.list | grep genus | sed 's/|/\t/g' | cut -f2 > mg1655.list.genus.gi less mg1655.list | grep family | sed 's/|/\t/g' | cut -f2 > mg1655.list.family.gi less mg1655.list | grep class | sed 's/|/\t/g' | cut -f2 > mg1655.list.class.gi less mg1655.list | grep phylum | sed 's/|/\t/g' | cut -f2 > mg1655.list.phylum.gi # this is to extract the gi number of mg1655 orfans from step 5 less mg1655.orfan | cut -f2 -d'|' | sort > mg1655.orfan.gi sort mg1655.list.strict.gi > mg1655.list.strict.gi.sort # this is how many orfans in mg1655.orfan.gi are no longer orfans in mg1655.list.strict.gi # answers question in step 7 comm -13 mg1655.list.strict.gi.sort mg1655.orfan.gi | wc -l # the following two cmds could be ignored comm -12 mg1655.list.strict.gi.sort mg1655.orfan.gi | wc -l comm -23 mg1655.list.strict.gi.sort mg1655.orfan.gi | wc -l #### step 8: the xls files will be transferred to your Windows computer as Table S1 to S6 grep -f mg1655.list.strict.gi Escherichia_coli_K_12_substr__MG1655_uid57779/NC_000913.ptt grep -f mg1655.list.strict.gi Escherichia_coli_K_12_substr__MG1655_uid57779/NC_000913.ptt > mg1655.list.strict.gi.ptt.xls grep -f mg1655.list.species.gi Escherichia_coli_K_12_substr__MG1655_uid57779/NC_000913.ptt > mg1655.list.species.gi.ptt.xls grep -f mg1655.list.genus.gi Escherichia_coli_K_12_substr__MG1655_uid57779/NC_000913.ptt > mg1655.list.genus.gi.ptt.xls grep -f mg1655.list.family.gi Escherichia_coli_K_12_substr__MG1655_uid57779/NC_000913.ptt > mg1655.list.family.gi.ptt.xls grep -f mg1655.list.class.gi Escherichia_coli_K_12_substr__MG1655_uid57779/NC_000913.ptt > mg1655.list.class.gi.ptt.xls grep -f mg1655.list.phylum.gi Escherichia_coli_K_12_substr__MG1655_uid57779/NC_000913.ptt > mg1655.list.phylum.gi.ptt.xls #### step 9 grep uncharacterized mg1655.list.species.gi.ptt.xls # the following counts go to Table 2: species orfans grep uncharacterized mg1655.list.species.gi.ptt.xls | wc -l grep phage mg1655.list.species.gi.ptt.xls | wc -l grep transposase mg1655.list.species.gi.ptt.xls | wc -l # the following counts go to Table 2: strict orfans grep uncharacterized mg1655.list.strict.gi.ptt.xls | wc -l grep phage mg1655.list.strict.gi.ptt.xls | wc -l grep transposase mg1655.list.strict.gi.ptt.xls | wc -l # the following counts go to Table 2: genus orfans grep uncharacterized mg1655.list.genus.gi.ptt.xls | wc -l grep phage mg1655.list.genus.gi.ptt.xls | wc -l grep transposase mg1655.list.genus.gi.ptt.xls | wc -l # the following counts go to Table 2: family orfans grep uncharacterized mg1655.list.family.gi.ptt.xls | wc -l grep phage mg1655.list.family.gi.ptt.xls | wc -l grep transposase mg1655.list.family.gi.ptt.xls | wc -l # the following counts go to Table 2: class orfans grep uncharacterized mg1655.list.class.gi.ptt.xls | wc -l grep phage mg1655.list.class.gi.ptt.xls | wc -l grep transposase mg1655.list.class.gi.ptt.xls | wc -l # the following counts go to Table 2: phylum orfans grep uncharacterized mg1655.list.phylum.gi.ptt.xls | wc -l grep phage mg1655.list.phylum.gi.ptt.xls | wc -l grep transposase mg1655.list.phylum.gi.ptt.xls | wc -l