#### all lines starting with # are comment lines  #####


#I'd highly highly recommend you to continue working on project 2 before our Thursday's class. Since I've made my command list (http://cys.bios.niu.edu/yyin/teach/PBB/cmd.txt) available to you, you could do it outside of class. On Thursday, I will have to go faster. If you can finish all the command lines beforehand, it will be much easier for all of us on Thursday. Then I will focus on to explain how you can complete the tables and figures. Otherwise based on today's progress, we won't be able to finish all the command lines in our very last class.

#For step 3:
#To avoid any troubles, as I mentioned today, you want to follow my command one-liners in that file. If you put the concatenated file in the individual genome folder (e.g. Escherichia_coli_K_12_substr__MG1655_uid57779), you will start to have problems in all the following steps. You want to use your project2 folder as your working directory. All the command one-lines are run in your project2 folder.

#For step 4:
#You might see warnings like: Selenocysteine (U) at position ** replaced by X. Maybe some other msgs too. No need to worry about that. See here for some explaination: https://www.biostars.org/p/111143/

#If you don't want to see these msgs, you can put 2>err at the end of the command line. I have updated my cmds in http://cys.bios.niu.edu/yyin/teach/PBB/cmd.txt. If you have already finished this step 4, no need to redo it.

#For step 5:
#I added a e-value filter step [awk '$11<1e-5']. Sorry I just realized that we have to use this filter because the blastp default is 10, which is too loose.


#### step 1 and 2
mkdir project2
cd project2
ls -l
lftp ftp.ncbi.nih.gov
  
#### step 3
ls -l Escherichia_coli_042_uid161985/
ls -l Escherichia_coli_042_uid161985/*faa
cat Escherichia_coli_042_uid161985/*faa > 042.faa
cat Escherichia_coli_536_uid58531/NC_008253.faa > 536.faa
cat Escherichia_coli_K_12_substr__MG1655_uid57779/NC_000913.faa > mg1655.faa

ls -l

#### step 4
makeblastdb -in 042.faa -dbtype prot -parse_seqids
makeblastdb -in 536.faa -dbtype prot -parse_seqids
makeblastdb -in mg1655.faa -dbtype prot -parse_seqids

ls -ltr

time blastp -query 042.faa -db 536.faa -out 042-536.out -outfmt 6 2>err &
time blastp -query 042.faa -db mg1655.faa -out 042-mg1655.out -outfmt 6 2>err &
time blastp -query 536.faa -db mg1655.faa -out 536-mg1655.out -outfmt 6 2>err &
time blastp -query 536.faa -db 042.faa -out 536-042.out -outfmt 6 2>err &
time blastp -query mg1655.faa -db 042.faa -out mg1655-042.out -outfmt 6 2>err &
time blastp -query mg1655.faa -db 536.faa -out mg1655-536.out -outfmt 6 2>err &

jobs

#### step 5
less 042-536.out | awk '$11<1e-5' | cut -f1 | sort -u > 042-536.out.id
less 042-mg1655.out | awk '$11<1e-5' | cut -f1 | sort -u > 042-mg1655.out.id
less 536-042.out | awk '$11<1e-5' | cut -f1 | sort -u > 536-042.out.id
less 536-mg1655.out | awk '$11<1e-5' | cut -f1 | sort -u > 536-mg1655.out.id
less mg1655-042.out | awk '$11<1e-5' | cut -f1 | sort -u > mg1655-042.out.id
less mg1655-536.out | awk '$11<1e-5' | cut -f1 | sort -u > mg1655-536.out.id

cat 042.faa | grep '>' | sed 's/>//' | cut -f1 -d' ' | sort -u > 042.faa.id
cat 536.faa | grep '>' | sed 's/>//' | cut -f1 -d' ' | sort -u > 536.faa.id
cat mg1655.faa | grep '>' | sed 's/>//' | cut -f1 -d' ' | sort -u > mg1655.faa.id

# count how many proteins in each genome; these numbers go to Table 1
# check slide 1 in http://cys.bios.niu.edu/yyin/teach/PBB/project2-venndiagram.pdf
wc -l *.faa.id

# count how many proteins in each genome having hits in the other two genomes 
# for example, 042-536.out.id has all the 042 protein IDs having hits in 536
# check slide 2 in http://cys.bios.niu.edu/yyin/teach/PBB/project2-venndiagram.pdf
# these numbers also go to Table 1
wc -l *.out.id

# how many common IDs
comm -12 042.faa.id 042-536.out.id | wc -l
# how many are unique IDs in 042-536.out.id; this should be 0
comm -13 042.faa.id 042-536.out.id | wc -l

# how many are unique IDs in 042.faa.id 
# these are number of 042 proteins that do not have hits in 536
# they are also called ORFans based on comparison of 042 vs 536
# check slide 2 in http://cys.bios.niu.edu/yyin/teach/PBB/project2-venndiagram.pdf
comm -23 042.faa.id 042-536.out.id | wc -l

# save all orfans IDs resulted from different comparisons
comm -23 042.faa.id 042-536.out.id > 042-536.out.id.orfan
comm -23 042.faa.id 042-mg1655.out.id > 042-mg1655.out.id.orfan
comm -23 536.faa.id 536-mg1655.out.id > 536-mg1655.out.id.orfan
comm -23 536.faa.id 536-042.out.id > 536-042.out.id.orfan
comm -23 mg1655.faa.id mg1655-042.out.id > mg1655-042.out.id.orfan
comm -23 mg1655.faa.id mg1655-536.out.id > mg1655-536.out.id.orfan

# count orfans in the six pair-wise comparisons
# check slide 3 in http://cys.bios.niu.edu/yyin/teach/PBB/project2-venndiagram.pdf
wc -l *orfan


# this is to find out how many 042 orfans are shared between 042-536 comparison and 042-mg1655 comparison
# that is, how many 042 proteins do not have hits in both 536 and mg1655
# check slide 4 in http://cys.bios.niu.edu/yyin/teach/PBB/project2-venndiagram.pdf
comm -12 042-536.out.id.orfan 042-mg1655.out.id.orfan | wc -l

# the follow two cmds could be ignored
comm -13 042-536.out.id.orfan 042-mg1655.out.id.orfan | wc -l
comm -23 042-536.out.id.orfan 042-mg1655.out.id.orfan | wc -l

# save the orfan IDs
# check slide 5 in http://cys.bios.niu.edu/yyin/teach/PBB/project2-venndiagram.pdf
comm -12 042-536.out.id.orfan 042-mg1655.out.id.orfan > 042.orfan
comm -12 536-042.out.id.orfan 536-mg1655.out.id.orfan > 536.orfan
comm -12 mg1655-042.out.id.orfan mg1655-536.out.id.orfan > mg1655.orfan

#### step 6
wget -q http://cys.bios.niu.edu/orfanfinder/files/ORFanFinder.tar.gz
ls -ltr

tar zxf ORFanFinder.tar.gz
g++ -std=c++0x ORFanFinder.cc -o ORFanFinder

./ORFanFinder /disk1/ecoli/Escherichia_coli_K_12_substr__MG1655_uid57779.out 511145 nodes ncbiBact.tax > mg1655.list &

#### step 7
less mg1655.list

# this is to count how many orfans of different ages, which go to Table 2, col 2
less mg1655.list | cut -f2 | sort | uniq -c

# save the gi numbers of different orfans
less mg1655.list | grep strict | sed 's/|/\t/g' | cut -f2 > mg1655.list.strict.gi
less mg1655.list | grep species | sed 's/|/\t/g' | cut -f2 > mg1655.list.species.gi
less mg1655.list | grep genus | sed 's/|/\t/g' | cut -f2 > mg1655.list.genus.gi
less mg1655.list | grep family | sed 's/|/\t/g' | cut -f2 > mg1655.list.family.gi
less mg1655.list | grep class | sed 's/|/\t/g' | cut -f2 > mg1655.list.class.gi
less mg1655.list | grep phylum | sed 's/|/\t/g' | cut -f2 > mg1655.list.phylum.gi

# this is to extract the gi number of mg1655 orfans from step 5
less mg1655.orfan | cut -f2 -d'|' | sort > mg1655.orfan.gi
sort mg1655.list.strict.gi > mg1655.list.strict.gi.sort


# this is how many orfans in mg1655.orfan.gi are no longer orfans in mg1655.list.strict.gi
# answers question in step 7
comm -13 mg1655.list.strict.gi.sort mg1655.orfan.gi | wc -l
# the following two cmds could be ignored
comm -12 mg1655.list.strict.gi.sort mg1655.orfan.gi | wc -l
comm -23 mg1655.list.strict.gi.sort mg1655.orfan.gi | wc -l

#### step 8: the xls files will be transferred to your Windows computer as Table S1 to S6
grep -f mg1655.list.strict.gi Escherichia_coli_K_12_substr__MG1655_uid57779/NC_000913.ptt
grep -f mg1655.list.strict.gi Escherichia_coli_K_12_substr__MG1655_uid57779/NC_000913.ptt > mg1655.list.strict.gi.ptt.xls
grep -f mg1655.list.species.gi Escherichia_coli_K_12_substr__MG1655_uid57779/NC_000913.ptt > mg1655.list.species.gi.ptt.xls
grep -f mg1655.list.genus.gi Escherichia_coli_K_12_substr__MG1655_uid57779/NC_000913.ptt > mg1655.list.genus.gi.ptt.xls
grep -f mg1655.list.family.gi Escherichia_coli_K_12_substr__MG1655_uid57779/NC_000913.ptt > mg1655.list.family.gi.ptt.xls
grep -f mg1655.list.class.gi Escherichia_coli_K_12_substr__MG1655_uid57779/NC_000913.ptt > mg1655.list.class.gi.ptt.xls
grep -f mg1655.list.phylum.gi Escherichia_coli_K_12_substr__MG1655_uid57779/NC_000913.ptt > mg1655.list.phylum.gi.ptt.xls

#### step 9
grep uncharacterized mg1655.list.species.gi.ptt.xls
# the following counts go to Table 2: species orfans
grep uncharacterized mg1655.list.species.gi.ptt.xls | wc -l
grep phage mg1655.list.species.gi.ptt.xls | wc -l
grep transposase mg1655.list.species.gi.ptt.xls | wc -l

# the following counts go to Table 2: strict orfans
grep uncharacterized mg1655.list.strict.gi.ptt.xls | wc -l
grep phage mg1655.list.strict.gi.ptt.xls | wc -l
grep transposase mg1655.list.strict.gi.ptt.xls | wc -l

# the following counts go to Table 2: genus orfans
grep uncharacterized mg1655.list.genus.gi.ptt.xls | wc -l
grep phage mg1655.list.genus.gi.ptt.xls | wc -l
grep transposase mg1655.list.genus.gi.ptt.xls | wc -l

# the following counts go to Table 2: family orfans
grep uncharacterized mg1655.list.family.gi.ptt.xls | wc -l
grep phage mg1655.list.family.gi.ptt.xls | wc -l
grep transposase mg1655.list.family.gi.ptt.xls | wc -l

# the following counts go to Table 2: class orfans
grep uncharacterized mg1655.list.class.gi.ptt.xls | wc -l
grep phage mg1655.list.class.gi.ptt.xls | wc -l
grep transposase mg1655.list.class.gi.ptt.xls | wc -l

# the following counts go to Table 2: phylum orfans
grep uncharacterized mg1655.list.phylum.gi.ptt.xls | wc -l
grep phage mg1655.list.phylum.gi.ptt.xls | wc -l
grep transposase mg1655.list.phylum.gi.ptt.xls | wc -l