MSMC2-Tutorial


0. 环境配置

cd /data/chaofan/software/
git clone https://github.com/stschiff/msmc2.git
cd msmc2

# dmd
wget http://downloads.dlang.org/releases/2.x/2.094.2/dmd.2.094.2.linux.tar.xz
xz -d dmd.2.094.2.linux.tar.xz ; tar -xvf dmd.2.094.2.linux.tar
echo  'export PATH="/data/chaofan/software/msmc2/dmd2/linux/bin64:$PATH"' >> ~/.bashrc
source ~/.bashrc

# gsl
wget http://mirror.rit.edu/gnu/gsl/gsl-2.6.tar.gz
tar -pzxvf gsl-2.6.tar.gz
cd gsl-2.6
./configure --prefix=/data/chaofan/software/msmc2/gsl
make
make install
cd ..

# make msmc2
## 修改原始的SLDIR, 改成你安装的
vi Makefile
'
SLDIR=/data/chaofan/software/msmc2/gsl/lib
'

make

如出现错误:

x86_64-conda-linux-gnu/lib/../lib64/libgcc_s.so.1: undefined reference to memcpy@GLIBC_2.14' collect2: error: ld returned 1 exit status Error: linker exited with status 1 make: *** [Makefile:20:build/release/msmc2] error1 退出conda环境再make`

conda deactivate

make

# 下载配套工具
git clone https://github.com/stschiff/msmc-tools.git
git clone https://github.com/jessicarick/msmc2_scripts.git

# mask file
wget http://lh3lh3.users.sourceforge.net/download/seqbility-20091110.tar.bz2
tar jxfv seqbility-20091110.tar.bz2
cd seqbility-20091110
make
cd ..

# 下载测试数据  手动下载
"https://share.eva.mpg.de/index.php/s/swpTM4mNK7gG7nw/"

1. 测试 MSMC2

1.1 Generating consensus sequences for each sample

# 把网页下载的数据拖到这里
cd /data/chaofan/software/msmc2 ; mkdir TEST_data ; cd TEST_data

unzip MSMC-tutorial-files.zip ; cd MSMC-tutorial-files

现在开始测试,一共 6 个样本。

mkdir -p ./CF_TRY/consensus_calls

MASTERVARDIR=./cg_data
OUTDIR=./CF_TRY/consensus_calls

CHR=chr1
for IND in NA19238 NA19239 NA19240 NA12878 NA12891 NA12892; do
    MASTERVAR=$(ls $MASTERVARDIR/masterVarBeta-$IND-*.tsv.chr1.bz2)
    OUT_MASK=$OUTDIR/$IND.$CHR.mask.bed.gz
    OUT_VCF=$OUTDIR/$IND.$CHR.vcf.gz
    /data/chaofan/software/msmc2/msmc-tools/cgCaller.py $CHR $IND $OUT_MASK $MASTERVAR | gzip -c > $OUT_VCF
done

1.2 Combining samples

mkdir -p ./CF_TRY/msmc2_input

INDIR=./CF_TRY/consensus_calls
OUTDIR=./CF_TRY/msmc2_input
MAPDIR=./
/data/chaofan/software/msmc2/msmc-tools/generate_multihetsep.py --chr 1 \
    --mask $INDIR/NA12878.chr1.mask.bed.gz --mask $INDIR/NA12891.chr1.mask.bed.gz --mask $INDIR/NA12892.chr1.mask.bed.gz \
    --mask $INDIR/NA19240.chr1.mask.bed.gz --mask $INDIR/NA19238.chr1.mask.bed.gz --mask $INDIR/NA19239.chr1.mask.bed.gz \
    --mask $MAPDIR/hs37d5_chr1.mask.bed --trio 0,1,2 --trio 3,4,5 \
    $INDIR/NA12878.chr1.vcf.gz $INDIR/NA12891.chr1.vcf.gz $INDIR/NA12892.chr1.vcf.gz \
    $INDIR/NA19240.chr1.vcf.gz $INDIR/NA19238.chr1.vcf.gz $INDIR/NA19239.chr1.vcf.gz \
    > $OUTDIR/EUR_AFR.chr1.multihetsep.txt

1.3 Estimating the effective population size

INPUTDIR=./CF_TRY/msmc2_input
OUTDIR=./CF_TRY/msmc2_input

/data/chaofan/software/msmc2/build/release/msmc2 -t 6 -p 1*2+15*1+1*2 -o $OUTDIR/EUR.msmc2 -I 0,1,2,3 $INPUTDIR/EUR_AFR.chr1.multihetsep.txt
/data/chaofan/software/msmc2/build/release/msmc2 -t 6 -p 1*2+15*1+1*2 -o $OUTDIR/AFR.msmc2 -I 4,5,6,7 $INPUTDIR/EUR_AFR.chr1.multihetsep.txt

1.4 结果可视化

结果是和官方文档一样的,这里只是简单记录下运行步骤和检查msmc2是否安装成功,具体步骤参阅官方文档。

mu <- 1.25e-8
gen <- 30
afrDat<-read.table("CF_TRY/msmc2_input/AFR.msmc2.final.txt", header=TRUE)
eurDat<-read.table("CF_TRY/msmc2_input/EUR.msmc2.final.txt", header=TRUE)
plot(afrDat$left_time_boundary/mu*gen, (1/afrDat$lambda)/(2*mu), log="x",ylim=c(0,100000),
     type="n", xlab="Years ago", ylab="effective population size")
lines(afrDat$left_time_boundary/mu*gen, (1/afrDat$lambda)/(2*mu), type="s", col="red")
lines(eurDat$left_time_boundary/mu*gen, (1/eurDat$lambda)/(2*mu), type="s", col="blue")
legend("topright",legend=c("African", "European"), col=c("red", "blue"), lty=c(1,1))

2. 实例

2.1 Generate masked-file

# work dir
cd /data/chaofan/projects/06.daily/06.yelu/05.add_data/02.GATK_pipeline/07.population/03.clean_sample/00.population_structure/03.MSMC

mkdir TEST
cd TEST

通过 snpable 生成 masked_genome, 主要使用 msmc2_scripts 的脚本3
对原始的run_snpable2.sh文件稍微进行调整。
确保已经安装以下软件(且在 PATH 里):

  • gcc
  • samtools
  • bcftools
  • vcftools
  • bwa
  • python
00.make_masked.sh
#!/bin/sh MSMCTOOLS=/data/chaofan/software/msmc2/msmc-tools # folder with msmc-tools binaries PATH=$PATH:$MSMCTOOLS OUTDIR=/data/chaofan/projects/06.daily/06.yelu/05.add_data/02.GATK_pipeline/07.population/03.clean_sample/00.population_structure/03.MSMC/TEST/00.Masked_geno # main directory for output files # for msmc_1_call.sh GENOME=/data/chaofan/projects/06.daily/06.yelu/02.vcf_anno/Alternaria_Alternata_Z7_genomic.fna # reference genome fasta prefix=Z7_Masked # prefix of genome masks k=35 scriptdir=/data/chaofan/software/msmc2/msmc2_scripts snpable_script_path=/data/chaofan/software/msmc2/seqbility-20091110 # directory with snpable scripts PATH=$PATH:$snpable_script_path mkdir ${OUTDIR}/snpable cd ${OUTDIR}/snpable echo "Starting extraction of overlapping ${k}-mer subsequences" splitfa $GENOME $k | split -l 20000000 cat x* >> ${prefix}_split.$k # if it can't find splitfa, try adding seqbility to the path using 'PATH=$PATH:/project/WagnerLab/jrick/msmc_Sept2017/snpable/scripts' echo "Aligning ${k}-mer reads to the genome with BWA, then converting to sam file" # the genome needs to be indexed prior to this step-- if it has not already been indexed, run: if [ -f "${GENOME}.bwt" ]; then echo "$GENOME already indexed" else echo "indexing $GENOME" bwa index $GENOME fi echo "aligning reads to genome with BWA and converting to sam" bwa aln -t 8 -R 1000000 -O 3 -E 3 ${GENOME} ${prefix}_split.${k} > ${prefix}_split.${k}.sai bwa samse -f ${prefix}_split.${k}.sam $GENOME ${prefix}_split.${k}.sai ${prefix}_split.${k} echo "reads aligned, starting to generate rawMask" gen_raw_mask.pl ${prefix}_split.${k}.sam > ${prefix}_rawMask.${k}.fa echo "raw mask created as ${prefix}_rawMask.35.fa, now generating final mask with stringency r=50%" gen_mask -l ${k} -r 0.5 ${prefix}_rawMask.${k}.fa > ${prefix}_mask.${k}.50.fa echo "all done! final mask saved as ${prefix}_mask.${k}.50.fa"

成功运行后使用makeMappabilityMask.py得到每条染色体最终的 mask 文件。可能是 gzip 的版本不对,我在运行脚本的时候报错,这个时候直接去掉 gzip 的相关操作,最后对结果文件进行 gzip 压缩就行。

# 修改makeMappabilityMask.py默认的输入输出
vi /data/chaofan/software/msmc2/msmc-tools/makeMappabilityMask.py
python /data/chaofan/software/msmc2/msmc-tools/makeMappabilityMask.py

2.2 make vcf-file of each chr of each sample

准备工作:

mkdir 01.vcf_bed

# 只保留常染色体
less /data/chaofan/projects/06.daily/06.yelu/02.vcf_anno/Alternaria_Alternata_Z7_genomic.fna | grep \> | cut -d ' ' -f 1 | cut -d '>' -f 2 | sort | head -n 10 > Chr.lst

我们这里挑选 8 个样本进行分析。手动生成一个sample.lst,每个样本一行。我们小小的修改下bamCaller.py的内容,让它适用于单倍体。并对结果文件*vcf.gz进行基因型的Double

bamCaller_sample.sh
sample_ID=$1 OUD=$2 if [ -d "${OUD}" ]; then echo "" else mkdir ${OUD} fi refs=/data/chaofan/projects/06.daily/06.yelu/02.vcf_anno/Alternaria_Alternata_Z7_genomic.fna bam_file=/data/chaofan/projects/06.daily/06.yelu/05.add_data/02.GATK_pipeline/01.alignment_bam/${sample_ID}.sorted.marked.bam # 这里我用的是平均测序深度 depth=30 # build index for bam if [ -f "${bam_file}.bai" ]; then echo "$bam_file already indexed" else echo "indexing $bam_file" samtools index ${bam_file} fi # bamCaller.py 只能处理单条染色体 [ -e /tmp/zwy ] || mkfifo /tmp/zwy exec 3<>/tmp/zwy rm -rf /tmp/zwy for ((i=1;i<=5;i++)) do echo >&3 done cat /data/chaofan/projects/06.daily/06.yelu/05.add_data/02.GATK_pipeline/07.population/03.clean_sample/00.population_structure/03.MSMC/TEST/Chr.lst | while read chr do read -u3 { samtools mpileup -q 20 -Q 20 -C 50 -u -r ${chr} -f ${refs} ${bam_file} | bcftools call -c -V indels --ploidy 1 | /data/chaofan/software/msmc2/msmc-tools/bamCaller.py ${depth} ${OUD}/${sample_ID}.${chr}.mask.bed.gz | gzip -c > ${OUD}/${sample_ID}.${chr}.vcf.gz python /data/chaofan/projects/06.daily/06.yelu/05.add_data/02.GATK_pipeline/07.population/03.clean_sample/00.population_structure/03.MSMC/TEST/00.double_haploid_vcf.py ${OUD}/${sample_ID}.${chr}.vcf.gz | bgzip -c > ${OUD}/${sample_ID}.${chr}.d.vcf.gz echo >&3 }& done wait

2.3 Combining samples

# mkdir 02.merge_res

INDIR=./01.vcf_bed
OUTDIR=./02.merge_res
MAPDIR=./01.vcf_bed
cat Chr.lst | while read Chr
do
/data/chaofan/software/msmc2/msmc-tools/generate_multihetsep.py --chr ${Chr} \
    --mask $INDIR/CCA047.${Chr}.mask.bed.gz --mask $INDIR/CCA133.${Chr}.mask.bed.gz --mask $INDIR/CCA205.${Chr}.mask.bed.gz \
    --mask $INDIR/CCA206.${Chr}.mask.bed.gz --mask $INDIR/CCA210.${Chr}.mask.bed.gz --mask $INDIR/CCA215.${Chr}.mask.bed.gz \
    --mask $INDIR/CCA298.${Chr}.mask.bed.gz --mask $INDIR/CCA560.${Chr}.mask.bed.gz --mask 00.Masked_geno/Z7_${Chr}.mask.bed.gz \
    `ls ${MAPDIR}/*.${Chr}.d.vcf.gz` > $OUTDIR/Clade3.${Chr}.multihetsep.txt &
done

2.4 running MSMC2

/data/chaofan/software/msmc2/build/release/msmc2 -t 30 -p 1*2+15*1+1*2 -o 02.merge_res/Clade3.msmc2 02.merge_res/Clade3.*.multihetsep.txt

References:

https://blog.csdn.net/weixin_45694863/article/details/126812038

https://github.com/stschiff/msmc-tools/blob/master/msmc-tutorial/guide.md

https://github.com/jessicarick/msmc2_scripts/blob/master/run_snpable2.sh


文章作者: zhangchaofan
版权声明: 本博客所有文章除特別声明外,均采用 CC BY 4.0 许可协议。转载请注明来源 zhangchaofan !
评论
  目录