## Script to get variants from two sets of trimmed fastq files given a reference file.
## Parameters are recognized by their position.
## Usage:
## ./get_variants.sh REF LIBRARY_1 TYPE_1 PATH_1 LIBRARY_2 TYPE_2 PATH_2 INDIVIDUAL PLOIDY OUTDIR
## 1 REF -- Reference fasta file with fully qualified or relative path to that file.
## 2 LIBRARY_1 -- Name of the first libary, without extension.
## 3 TYPE_1 -- Type of the first libary, single or paired. 
##             If library is single-end read library, then the script will be looing for LIBRARY_1_trimmed.fastq file which is assumed to contain trimmed single-end reads.
##             If library is paired-end read library, then the script will be looing for LIBRARY_1.1P, LIBRARY_1.1P, LIBRARY_1.1P, LIBRARY_1.1P files which are assumed to contain trimmed paired-end forward and reverse reads.
## 4 PATH_1 -- Path to where trinned reads from the first library are.
## 5 LIBRARY_2 -- Name of the second libary, without extension.
## 6 TYPE_2 -- Type of the second libary, single or paired.
## 7 PATH_2 -- Path to where trinned reads from the second library are.
## 8 INDIVIDUAL -- Name of the individual to be used as prefic for output files.
## 9 PLOIDY -- ploidy level.
## 10 OUTDIR -- output diretory where resilts will be written. If does not exist -- it will be created.
##
## 
##
## Example 1:
## ./get_variants.sh ./vesca.new.ref.fa SRR5275181 single ./ SRR5275182 paired ./ vesca 2 vesca_round2 > get_variants_vesca_out 2> get_variants_vesca_err
##
## Example 2:
## ./get_variants.sh ./corimbosa.new.ref.fa SRR5275235 paired ./ SRR5275236 paired ./ corimbosa 4 corimbosa_round2 > get_variants_corimbosa_out 2> get_variants_corimbosa_err
##       



TIME=$(date +%c)
echo Start $TIME

echo "Setting environment"

## Set some environmental variables, mainly specofying path to tools needed
export PATH_PICARD=/home/olga/Downloads/picard-tools-1.96/
export PATH_GATK=~/Downloads/GenomeAnalysisTK-3.7/ 

export PATH=$PATH:/home/olga/Downloads/samtools-0.1.19/
export PATH=$PATH:/home/olga/Downloads/bwa-0.7.4/
export PATH=$PATH:/home/olga/Downloads/samtools-0.1.19/bcftools/


PATH_OUT=${10}

if [ ! -d ${PATH_OUT} ]; then
  mkdir ${PATH_OUT}
fi

REF_PATH_FILE=${1}
REF_PATH=${REF_PATH_FILE%/*}
REF_FILE=${REF_PATH_FILE##*/}
REF_NAME=${REF_FILE%.*}
REF_NAME_PATH=$REF_PATH/$REF_NAME

## Prepare reference
echo Preparing reference file ${REF_PATH_FILE}
bwa index $REF_PATH_FILE
samtools faidx $REF_PATH_FILE

if [ -f "$REF_NAME_PATH.dict" ]
then
	rm $REF_NAME_PATH.dict
fi
java -jar $PATH_PICARD/CreateSequenceDictionary.jar REFERENCE=$REF_PATH_FILE OUTPUT=$REF_NAME_PATH.dict









## Process first library with following steps:
## Get necessary names
## Map reads to the reference
## Write in sam format
## Write in bam format
## Sort reads
## Index sorted bam file

LIB=${2}
TYPE=${3}
PATH_IN=${4}


TIME=$(date +%T)
echo Processing ${TYPE}-end read library ${LIB}: $TIME

if [ $TYPE == "single" ]; then
	
	if [ ! -f "${PATH_IN}/${LIB}_trimmed.fastq" ]
	then
		echo "Script assumes that trimmed ${TYPE}-end reads are in ${PATH_IN}/${LIB}_trimmed.fastq file but it does not exist"
		exit 1
	fi

	
	bwa aln -f ${PATH_OUT}/${LIB}.sai $REF_PATH_FILE ${PATH_IN}/${LIB}_trimmed.fastq
	bwa samse -r '@RG\tID:foo\tSM:bar\tPL:ILLUMINA' $REF_PATH_FILE ${PATH_OUT}/${LIB}.sai ${PATH_IN}/${LIB}_trimmed.fastq > ${PATH_OUT}/${LIB}.sam
	samtools view -b -S -o ${PATH_OUT}/${LIB}.bam ${PATH_OUT}/${LIB}.sam
	samtools sort ${PATH_OUT}/${LIB}.bam ${PATH_OUT}/${LIB}.sorted
	samtools index ${PATH_OUT}/${LIB}.sorted.bam
	TO_MERGE_1="${PATH_OUT}/${LIB}.sorted.bam"

else
	if [ ! -f "${PATH_IN}/${LIB}.1P" ] || [ ! -f "${PATH_IN}/${LIB}.2P" ] || [ ! -f "${PATH_IN}/${LIB}.1U" ] || [ ! -f "${PATH_IN}/${LIB}.2U" ]
	then
		echo "Script assumes that trimmed ${TYPE}-end reads are in ${LIB}.1P, ${LIB}.2P, ${LIB}.1U, ${LIB}.2U files, located in ${PATH_IN} directory but at least one of them does not exist"
		exit 1
	fi
	

	bwa aln -f ${PATH_OUT}/${LIB}.1P.sai $REF_PATH_FILE ${PATH_IN}/${LIB}.1P
	bwa aln -f ${PATH_OUT}/${LIB}.2P.sai $REF_PATH_FILE ${PATH_IN}/${LIB}.2P
	bwa aln -f ${PATH_OUT}/${LIB}.1U.sai $REF_PATH_FILE ${PATH_IN}/${LIB}.1U
	bwa aln -f ${PATH_OUT}/${LIB}.2U.sai $REF_PATH_FILE ${PATH_IN}/${LIB}.2U

	bwa sampe -r '@RG\tID:foo\tSM:bar\tPL:ILLUMINA' $REF_PATH_FILE ${PATH_OUT}/${LIB}.1P.sai ${PATH_OUT}/${LIB}.2P.sai ${PATH_IN}/${LIB}.1P ${PATH_IN}/${LIB}.2P > ${PATH_OUT}/${LIB}.sam
	bwa samse -r '@RG\tID:foo\tSM:bar\tPL:ILLUMINA' $REF_PATH_FILE ${PATH_OUT}/${LIB}.1U.sai ${PATH_IN}/${LIB}.1U > ${PATH_OUT}/${LIB}.1U.sam
	bwa samse -r '@RG\tID:foo\tSM:bar\tPL:ILLUMINA' $REF_PATH_FILE ${PATH_OUT}/${LIB}.2U.sai ${PATH_IN}/${LIB}.2U > ${PATH_OUT}/${LIB}.2U.sam

	samtools view -b -S -o ${PATH_OUT}/${LIB}.bam ${PATH_OUT}/${LIB}.sam
	samtools sort ${PATH_OUT}/${LIB}.bam ${PATH_OUT}/${LIB}.sorted
	samtools index ${PATH_OUT}/${LIB}.sorted.bam

	samtools view -b -S -o ${PATH_OUT}/${LIB}.1U.bam ${PATH_OUT}/${LIB}.1U.sam
	samtools sort ${PATH_OUT}/${LIB}.1U.bam ${PATH_OUT}/${LIB}.1U.sorted
	samtools index ${PATH_OUT}/${LIB}.1U.sorted.bam

	samtools view -b -S -o ${PATH_OUT}/${LIB}.2U.bam ${PATH_OUT}/${LIB}.2U.sam
	samtools sort ${PATH_OUT}/${LIB}.2U.bam ${PATH_OUT}/${LIB}.2U.sorted
	samtools index ${PATH_OUT}/${LIB}.2U.sorted.bam
	TO_MERGE_1="${PATH_OUT}/${LIB}.sorted.bam ${PATH_OUT}/${LIB}.1U.sorted.bam ${PATH_OUT}/${LIB}.2U.sorted.bam"

fi



## Process second library:

LIB=${5}
TYPE=${6}
PATH_IN=${7}

TIME=$(date +%T)
echo Processing ${TYPE}-end read library ${LIB}: $TIME

if [ $TYPE == "single" ]; then
	
	if [ ! -f "${PATH_IN}/${LIB}_trimmed.fastq" ]
	then
		echo "Script assumes that trimmed ${TYPE}-end reads are in ${PATH_IN}/${LIB}_trimmed.fastq file but it does not exist"
		exit 1
	fi

	bwa aln -f ${PATH_OUT}/${LIB}.sai $REF_PATH_FILE ${PATH_IN}/${LIB}_trimmed.fastq
	bwa samse -r '@RG\tID:foo\tSM:bar\tPL:ILLUMINA' $REF_PATH_FILE ${PATH_OUT}/${LIB}.sai ${PATH_IN}/${LIB}_trimmed.fastq > ${PATH_OUT}/${LIB}.sam
	samtools view -b -S -o ${PATH_OUT}/${LIB}.bam ${PATH_OUT}/${LIB}.sam
	samtools sort ${PATH_OUT}/${LIB}.bam ${PATH_OUT}/${LIB}.sorted
	samtools index ${PATH_OUT}/${LIB}.sorted.bam
	TO_MERGE_2="${PATH_OUT}/${LIB}.sorted.bam"

else

	if [ ! -f "${PATH_IN}/${LIB}.1P" ] || [ ! -f "${PATH_IN}/${LIB}.2P" ] || [ ! -f "${PATH_IN}/${LIB}.1U" ] || [ ! -f "${PATH_IN}/${LIB}.2U" ]
	then
		echo "Script assumes that trimmed ${TYPE}-end reads are in ${LIB}.1P, ${LIB}.2P, ${LIB}.1U, ${LIB}.2U files, located in ${PATH_IN} directory but at least one of them does not exist"
		exit 1
	fi


	bwa aln -f ${PATH_OUT}/${LIB}.1P.sai $REF_PATH_FILE ${PATH_IN}/${LIB}.1P
	bwa aln -f ${PATH_OUT}/${LIB}.2P.sai $REF_PATH_FILE ${PATH_IN}/${LIB}.2P
	bwa aln -f ${PATH_OUT}/${LIB}.1U.sai $REF_PATH_FILE ${PATH_IN}/${LIB}.1U
	bwa aln -f ${PATH_OUT}/${LIB}.2U.sai $REF_PATH_FILE ${PATH_IN}/${LIB}.2U

	bwa sampe -r '@RG\tID:foo\tSM:bar\tPL:ILLUMINA' $REF_PATH_FILE ${PATH_OUT}/${LIB}.1P.sai ${PATH_OUT}/${LIB}.2P.sai ${PATH_IN}/${LIB}.1P ${PATH_IN}/${LIB}.2P > ${PATH_OUT}/${LIB}.sam
	bwa samse -r '@RG\tID:foo\tSM:bar\tPL:ILLUMINA' $REF_PATH_FILE ${PATH_OUT}/${LIB}.1U.sai ${PATH_IN}/${LIB}.1U > ${PATH_OUT}/${LIB}.1U.sam
	bwa samse -r '@RG\tID:foo\tSM:bar\tPL:ILLUMINA' $REF_PATH_FILE ${PATH_OUT}/${LIB}.2U.sai ${PATH_IN}/${LIB}.2U > ${PATH_OUT}/${LIB}.2U.sam

	samtools view -b -S -o ${PATH_OUT}/${LIB}.bam ${PATH_OUT}/${LIB}.sam
	samtools sort ${PATH_OUT}/${LIB}.bam ${PATH_OUT}/${LIB}.sorted
	samtools index ${PATH_OUT}/${LIB}.sorted.bam

	samtools view -b -S -o ${PATH_OUT}/${LIB}.1U.bam ${PATH_OUT}/${LIB}.1U.sam
	samtools sort ${PATH_OUT}/${LIB}.1U.bam ${PATH_OUT}/${LIB}.1U.sorted
	samtools index ${PATH_OUT}/${LIB}.1U.sorted.bam

	samtools view -b -S -o ${PATH_OUT}/${LIB}.2U.bam ${PATH_OUT}/${LIB}.2U.sam
	samtools sort ${PATH_OUT}/${LIB}.2U.bam ${PATH_OUT}/${LIB}.2U.sorted
	samtools index ${PATH_OUT}/${LIB}.2U.sorted.bam
	TO_MERGE_2="${PATH_OUT}/${LIB}.sorted.bam ${PATH_OUT}/${LIB}.1U.sorted.bam ${PATH_OUT}/${LIB}.2U.sorted.bam"

fi


INDIVIDUAL=${8}
PLOIDY=${9}

## Merge two libraries, sort and index again
TIME=$(date +%T)
echo Merging files two libraries, sorting and indexing them: $TIME
samtools merge -u ${PATH_OUT}/${INDIVIDUAL}.merge.bam ${TO_MERGE_1} ${TO_MERGE_2}
samtools sort ${PATH_OUT}/${INDIVIDUAL}.merge.bam ${PATH_OUT}/${INDIVIDUAL}.merge.sorted
samtools index ${PATH_OUT}/${INDIVIDUAL}.merge.sorted.bam

## Remove diplicated reads and index new bam file
TIME=$(date +%T)
echo Removing duplicated reads: $TIME
samtools rmdup -sS ${PATH_OUT}/${INDIVIDUAL}.merge.sorted.bam ${PATH_OUT}/${INDIVIDUAL}.dedup.bam
samtools index ${PATH_OUT}/${INDIVIDUAL}.dedup.bam


## Realign around indels 
TIME=$(date +%T)
echo Realigning around indels: $TIME
java -jar $PATH_GATK/GenomeAnalysisTK.jar -T RealignerTargetCreator -R ${REF_PATH_FILE} -I ${PATH_OUT}/${INDIVIDUAL}.dedup.bam -o ${PATH_OUT}/${INDIVIDUAL}.realigner.intervals
java -jar $PATH_GATK/GenomeAnalysisTK.jar -T IndelRealigner -R ${REF_PATH_FILE} -I ${PATH_OUT}/${INDIVIDUAL}.dedup.bam -targetIntervals ${PATH_OUT}/${INDIVIDUAL}.realigner.intervals -o ${PATH_OUT}/${INDIVIDUAL}.realigned.bam

## Call genotypes
TIME=$(date +%T)
echo Calling genotypes: $TIME
java -jar $PATH_GATK/GenomeAnalysisTK.jar -T UnifiedGenotyper -R ${REF_PATH_FILE} -I ${PATH_OUT}/${INDIVIDUAL}.realigned.bam -ploidy ${PLOIDY} -minIndelFrac 0.08 -minIndelCnt 2 -stand_call_conf 17.0 --genotype_likelihoods_model BOTH -maxAltAlleles 2 -deletions 0.1 -o ${PATH_OUT}/${INDIVIDUAL}.realigned.vcf



## Clean up vcf file and filter variants
perl util_1.pl ${PATH_OUT}/${INDIVIDUAL}.realigned.vcf ${PATH_OUT}/${INDIVIDUAL}.realigned.edited.vcf
perl util_genotype_filter_1.pl ${PATH_OUT}/${INDIVIDUAL}.realigned.edited.vcf ${PATH_OUT}/${INDIVIDUAL}.realigned.filterred.vcf

## Recalibrate basis, run genotyping again and clean up resulting files
TIME=$(date +%T)
echo Recalibrating bases, calling genotypes again and printing new reference: $TIME
java -jar $PATH_GATK/GenomeAnalysisTK.jar -T BaseRecalibrator -R ${REF_PATH_FILE} -I ${PATH_OUT}/${INDIVIDUAL}.realigned.bam -knownSites ${PATH_OUT}/${INDIVIDUAL}.realigned.filterred.vcf -o ${PATH_OUT}/${INDIVIDUAL}.recal_data.table
java -jar $PATH_GATK/GenomeAnalysisTK.jar -T PrintReads -R ${REF_PATH_FILE} -I ${PATH_OUT}/${INDIVIDUAL}.realigned.bam -BQSR ${PATH_OUT}/${INDIVIDUAL}.recal_data.table -o ${PATH_OUT}/${INDIVIDUAL}.calibrated.bam
java -jar $PATH_GATK/GenomeAnalysisTK.jar -T UnifiedGenotyper -R ${REF_PATH_FILE} -I ${PATH_OUT}/${INDIVIDUAL}.calibrated.bam -ploidy ${PLOIDY} -minIndelFrac 0.08 -minIndelCnt 2 -stand_call_conf 17.0 --genotype_likelihoods_model BOTH -maxAltAlleles 2 -deletions 0.1 -o ${PATH_OUT}/${INDIVIDUAL}.calibrated.vcf
perl util_1.pl ${PATH_OUT}/${INDIVIDUAL}.calibrated.vcf ${PATH_OUT}/${INDIVIDUAL}.calibrated.edited.vcf
perl util_genotype_filter_1.pl ${PATH_OUT}/${INDIVIDUAL}.calibrated.edited.vcf ${PATH_OUT}/${INDIVIDUAL}.calibrated.filterred.vcf

java -jar $PATH_GATK/GenomeAnalysisTK.jar -T FastaAlternateReferenceMaker -R ${REF_PATH_FILE} -V ${PATH_OUT}/${INDIVIDUAL}.calibrated.filterred.vcf -o ${PATH_OUT}/${INDIVIDUAL}.new.ref.fa


## Get stats on coverage
TIME=$(date +%T)
echo Getting sequencing coverage: $TIME
samtools  mpileup -uf ${PATH_OUT}/${INDIVIDUAL}.new.ref.fa ${PATH_OUT}/${INDIVIDUAL}.calibrated.bam > ${PATH_OUT}/${INDIVIDUAL}.all_sites.calibrated.bcf
bcftools view -cg ${PATH_OUT}/${INDIVIDUAL}.all_sites.calibrated.bcf > ${PATH_OUT}/${INDIVIDUAL}.all_sites.calibrated.vcf



TIME=$(date +%c)
echo Done: $TIME
