0. 环境配置
cd /data/chaofan/software/
git clone https://github.com/stschiff/msmc2.git
cd msmc2
# dmd
wget http://downloads.dlang.org/releases/2.x/2.094.2/dmd.2.094.2.linux.tar.xz
xz -d dmd.2.094.2.linux.tar.xz ; tar -xvf dmd.2.094.2.linux.tar
echo 'export PATH="/data/chaofan/software/msmc2/dmd2/linux/bin64:$PATH"' >> ~/.bashrc
source ~/.bashrc
# gsl
wget http://mirror.rit.edu/gnu/gsl/gsl-2.6.tar.gz
tar -pzxvf gsl-2.6.tar.gz
cd gsl-2.6
./configure --prefix=/data/chaofan/software/msmc2/gsl
make
make install
cd ..
# make msmc2
## 修改原始的SLDIR, 改成你安装的
vi Makefile
'
SLDIR=/data/chaofan/software/msmc2/gsl/lib
'
make
如出现错误:
x86_64-conda-linux-gnu/lib/../lib64/libgcc_s.so.1: undefined reference to
memcpy@GLIBC_2.14' collect2: error: ld returned 1 exit status Error: linker exited with status 1 make: *** [Makefile:20:build/release/msmc2] error1 退出conda环境再
make`
conda deactivate
make
# 下载配套工具
git clone https://github.com/stschiff/msmc-tools.git
git clone https://github.com/jessicarick/msmc2_scripts.git
# mask file
wget http://lh3lh3.users.sourceforge.net/download/seqbility-20091110.tar.bz2
tar jxfv seqbility-20091110.tar.bz2
cd seqbility-20091110
make
cd ..
# 下载测试数据 手动下载
"https://share.eva.mpg.de/index.php/s/swpTM4mNK7gG7nw/"
1. 测试 MSMC2
1.1 Generating consensus sequences for each sample
# 把网页下载的数据拖到这里
cd /data/chaofan/software/msmc2 ; mkdir TEST_data ; cd TEST_data
unzip MSMC-tutorial-files.zip ; cd MSMC-tutorial-files
现在开始测试,一共 6 个样本。
mkdir -p ./CF_TRY/consensus_calls
MASTERVARDIR=./cg_data
OUTDIR=./CF_TRY/consensus_calls
CHR=chr1
for IND in NA19238 NA19239 NA19240 NA12878 NA12891 NA12892; do
MASTERVAR=$(ls $MASTERVARDIR/masterVarBeta-$IND-*.tsv.chr1.bz2)
OUT_MASK=$OUTDIR/$IND.$CHR.mask.bed.gz
OUT_VCF=$OUTDIR/$IND.$CHR.vcf.gz
/data/chaofan/software/msmc2/msmc-tools/cgCaller.py $CHR $IND $OUT_MASK $MASTERVAR | gzip -c > $OUT_VCF
done
1.2 Combining samples
mkdir -p ./CF_TRY/msmc2_input
INDIR=./CF_TRY/consensus_calls
OUTDIR=./CF_TRY/msmc2_input
MAPDIR=./
/data/chaofan/software/msmc2/msmc-tools/generate_multihetsep.py --chr 1 \
--mask $INDIR/NA12878.chr1.mask.bed.gz --mask $INDIR/NA12891.chr1.mask.bed.gz --mask $INDIR/NA12892.chr1.mask.bed.gz \
--mask $INDIR/NA19240.chr1.mask.bed.gz --mask $INDIR/NA19238.chr1.mask.bed.gz --mask $INDIR/NA19239.chr1.mask.bed.gz \
--mask $MAPDIR/hs37d5_chr1.mask.bed --trio 0,1,2 --trio 3,4,5 \
$INDIR/NA12878.chr1.vcf.gz $INDIR/NA12891.chr1.vcf.gz $INDIR/NA12892.chr1.vcf.gz \
$INDIR/NA19240.chr1.vcf.gz $INDIR/NA19238.chr1.vcf.gz $INDIR/NA19239.chr1.vcf.gz \
> $OUTDIR/EUR_AFR.chr1.multihetsep.txt
1.3 Estimating the effective population size
INPUTDIR=./CF_TRY/msmc2_input
OUTDIR=./CF_TRY/msmc2_input
/data/chaofan/software/msmc2/build/release/msmc2 -t 6 -p 1*2+15*1+1*2 -o $OUTDIR/EUR.msmc2 -I 0,1,2,3 $INPUTDIR/EUR_AFR.chr1.multihetsep.txt
/data/chaofan/software/msmc2/build/release/msmc2 -t 6 -p 1*2+15*1+1*2 -o $OUTDIR/AFR.msmc2 -I 4,5,6,7 $INPUTDIR/EUR_AFR.chr1.multihetsep.txt
1.4 结果可视化
结果是和官方文档一样的,这里只是简单记录下运行步骤和检查msmc2
是否安装成功,具体步骤参阅官方文档。
mu <- 1.25e-8
gen <- 30
afrDat<-read.table("CF_TRY/msmc2_input/AFR.msmc2.final.txt", header=TRUE)
eurDat<-read.table("CF_TRY/msmc2_input/EUR.msmc2.final.txt", header=TRUE)
plot(afrDat$left_time_boundary/mu*gen, (1/afrDat$lambda)/(2*mu), log="x",ylim=c(0,100000),
type="n", xlab="Years ago", ylab="effective population size")
lines(afrDat$left_time_boundary/mu*gen, (1/afrDat$lambda)/(2*mu), type="s", col="red")
lines(eurDat$left_time_boundary/mu*gen, (1/eurDat$lambda)/(2*mu), type="s", col="blue")
legend("topright",legend=c("African", "European"), col=c("red", "blue"), lty=c(1,1))
2. 实例
2.1 Generate masked-file
# work dir
cd /data/chaofan/projects/06.daily/06.yelu/05.add_data/02.GATK_pipeline/07.population/03.clean_sample/00.population_structure/03.MSMC
mkdir TEST
cd TEST
通过 snpable 生成 masked_genome, 主要使用 msmc2_scripts 的脚本3。
对原始的run_snpable2.sh
文件稍微进行调整。
确保已经安装以下软件(且在 PATH 里):
- gcc
- samtools
- bcftools
- vcftools
- bwa
- python
00.make_masked.sh#!/bin/sh MSMCTOOLS=/data/chaofan/software/msmc2/msmc-tools # folder with msmc-tools binaries PATH=$PATH:$MSMCTOOLS OUTDIR=/data/chaofan/projects/06.daily/06.yelu/05.add_data/02.GATK_pipeline/07.population/03.clean_sample/00.population_structure/03.MSMC/TEST/00.Masked_geno # main directory for output files # for msmc_1_call.sh GENOME=/data/chaofan/projects/06.daily/06.yelu/02.vcf_anno/Alternaria_Alternata_Z7_genomic.fna # reference genome fasta prefix=Z7_Masked # prefix of genome masks k=35 scriptdir=/data/chaofan/software/msmc2/msmc2_scripts snpable_script_path=/data/chaofan/software/msmc2/seqbility-20091110 # directory with snpable scripts PATH=$PATH:$snpable_script_path mkdir ${OUTDIR}/snpable cd ${OUTDIR}/snpable echo "Starting extraction of overlapping ${k}-mer subsequences" splitfa $GENOME $k | split -l 20000000 cat x* >> ${prefix}_split.$k # if it can't find splitfa, try adding seqbility to the path using 'PATH=$PATH:/project/WagnerLab/jrick/msmc_Sept2017/snpable/scripts' echo "Aligning ${k}-mer reads to the genome with BWA, then converting to sam file" # the genome needs to be indexed prior to this step-- if it has not already been indexed, run: if [ -f "${GENOME}.bwt" ]; then echo "$GENOME already indexed" else echo "indexing $GENOME" bwa index $GENOME fi echo "aligning reads to genome with BWA and converting to sam" bwa aln -t 8 -R 1000000 -O 3 -E 3 ${GENOME} ${prefix}_split.${k} > ${prefix}_split.${k}.sai bwa samse -f ${prefix}_split.${k}.sam $GENOME ${prefix}_split.${k}.sai ${prefix}_split.${k} echo "reads aligned, starting to generate rawMask" gen_raw_mask.pl ${prefix}_split.${k}.sam > ${prefix}_rawMask.${k}.fa echo "raw mask created as ${prefix}_rawMask.35.fa, now generating final mask with stringency r=50%" gen_mask -l ${k} -r 0.5 ${prefix}_rawMask.${k}.fa > ${prefix}_mask.${k}.50.fa echo "all done! final mask saved as ${prefix}_mask.${k}.50.fa"
成功运行后使用makeMappabilityMask.py
得到每条染色体最终的 mask 文件。可能是 gzip 的版本不对,我在运行脚本的时候报错,这个时候直接去掉 gzip 的相关操作,最后对结果文件进行 gzip 压缩就行。
# 修改makeMappabilityMask.py默认的输入输出
vi /data/chaofan/software/msmc2/msmc-tools/makeMappabilityMask.py
python /data/chaofan/software/msmc2/msmc-tools/makeMappabilityMask.py
2.2 make vcf-file of each chr of each sample
准备工作:
mkdir 01.vcf_bed
# 只保留常染色体
less /data/chaofan/projects/06.daily/06.yelu/02.vcf_anno/Alternaria_Alternata_Z7_genomic.fna | grep \> | cut -d ' ' -f 1 | cut -d '>' -f 2 | sort | head -n 10 > Chr.lst
我们这里挑选 8 个样本进行分析。手动生成一个sample.lst
,每个样本一行。我们小小的修改下bamCaller.py
的内容,让它适用于单倍体。并对结果文件*vcf.gz
进行基因型的Double
。
bamCaller_sample.shsample_ID=$1 OUD=$2 if [ -d "${OUD}" ]; then echo "" else mkdir ${OUD} fi refs=/data/chaofan/projects/06.daily/06.yelu/02.vcf_anno/Alternaria_Alternata_Z7_genomic.fna bam_file=/data/chaofan/projects/06.daily/06.yelu/05.add_data/02.GATK_pipeline/01.alignment_bam/${sample_ID}.sorted.marked.bam # 这里我用的是平均测序深度 depth=30 # build index for bam if [ -f "${bam_file}.bai" ]; then echo "$bam_file already indexed" else echo "indexing $bam_file" samtools index ${bam_file} fi # bamCaller.py 只能处理单条染色体 [ -e /tmp/zwy ] || mkfifo /tmp/zwy exec 3<>/tmp/zwy rm -rf /tmp/zwy for ((i=1;i<=5;i++)) do echo >&3 done cat /data/chaofan/projects/06.daily/06.yelu/05.add_data/02.GATK_pipeline/07.population/03.clean_sample/00.population_structure/03.MSMC/TEST/Chr.lst | while read chr do read -u3 { samtools mpileup -q 20 -Q 20 -C 50 -u -r ${chr} -f ${refs} ${bam_file} | bcftools call -c -V indels --ploidy 1 | /data/chaofan/software/msmc2/msmc-tools/bamCaller.py ${depth} ${OUD}/${sample_ID}.${chr}.mask.bed.gz | gzip -c > ${OUD}/${sample_ID}.${chr}.vcf.gz python /data/chaofan/projects/06.daily/06.yelu/05.add_data/02.GATK_pipeline/07.population/03.clean_sample/00.population_structure/03.MSMC/TEST/00.double_haploid_vcf.py ${OUD}/${sample_ID}.${chr}.vcf.gz | bgzip -c > ${OUD}/${sample_ID}.${chr}.d.vcf.gz echo >&3 }& done wait
2.3 Combining samples
# mkdir 02.merge_res
INDIR=./01.vcf_bed
OUTDIR=./02.merge_res
MAPDIR=./01.vcf_bed
cat Chr.lst | while read Chr
do
/data/chaofan/software/msmc2/msmc-tools/generate_multihetsep.py --chr ${Chr} \
--mask $INDIR/CCA047.${Chr}.mask.bed.gz --mask $INDIR/CCA133.${Chr}.mask.bed.gz --mask $INDIR/CCA205.${Chr}.mask.bed.gz \
--mask $INDIR/CCA206.${Chr}.mask.bed.gz --mask $INDIR/CCA210.${Chr}.mask.bed.gz --mask $INDIR/CCA215.${Chr}.mask.bed.gz \
--mask $INDIR/CCA298.${Chr}.mask.bed.gz --mask $INDIR/CCA560.${Chr}.mask.bed.gz --mask 00.Masked_geno/Z7_${Chr}.mask.bed.gz \
`ls ${MAPDIR}/*.${Chr}.d.vcf.gz` > $OUTDIR/Clade3.${Chr}.multihetsep.txt &
done
2.4 running MSMC2
/data/chaofan/software/msmc2/build/release/msmc2 -t 30 -p 1*2+15*1+1*2 -o 02.merge_res/Clade3.msmc2 02.merge_res/Clade3.*.multihetsep.txt
References:
https://blog.csdn.net/weixin_45694863/article/details/126812038
https://github.com/stschiff/msmc-tools/blob/master/msmc-tutorial/guide.md
https://github.com/jessicarick/msmc2_scripts/blob/master/run_snpable2.sh