wget https://zerkalo.curie.fr/partage/HiC-Pro/HiCPro_testdata.tar.gz && tar -zxvf HiCPro_testdata.tar.gz
It will generate a folder test_data with 2 subfolders:
dixon_2M
dixon_2M_2
In addition, one file is created in the current folder: config_test_latest.txt
The HiC-Pro config file config_test_latest.txt must be edited.
wget ftp://ftp.ccb.jhu.edu/pub/data/bowtie2_indexes/hg19.zip
mkdir hg19
unzip hg19.zip -d hg19
It will download the bowtie2 index files for human genome hg19.
After editing, the final config_test_latest.txt will look like this:
################################
#Please change the variable settings below if necessary
#########################################################################
##Paths and Settings - Do not edit !
#########################################################################
TMP_DIR = tmp
LOGS_DIR = logs
BOWTIE2_OUTPUT_DIR = bowtie_results
MAPC_OUTPUT = hic_results
RAW_DIR = rawdata
#######################################################################
##SYSTEM - PBS - Start Editing Here !!
### #commenting the PBS lines and keeping N_CPU and LOGFILE
#######################################################################
**N_CPU = 8**
LOGFILE = hicpro.log
#JOB_NAME = IMR90_split
#JOB_MEM = 10gb
#JOB_WALLTIME = 6:00:00
#JOB_QUEUE = batch
#JOB_MAIL = nservant@curie.fr
#########################################################################
##Data
#########################################################################
PAIR1_EXT = _R1
PAIR2_EXT = _R2
#######################################################################
##Alignment options
#######################################################################
FORMAT = phred33
MIN_MAPQ = 0
### #here is the path where we downloaded the hg19 index
**BOWTIE2_IDX_PATH = hg19/**
BOWTIE2_GLOBAL_OPTIONS = --very-sensitive -L 30 --score-min L,-0.6,-0.2 --end-to-end --reorder
BOWTIE2_LOCAL_OPTIONS = --very-sensitive -L 20 --score-min L,-0.6,-0.2 --end-to-end --reorder
#######################################################################
##Annotation files
#######################################################################
### #Name of the index files. For this tutorial we keep the values below; HiC-Pro ships with chrom_hg19.sizes
**REFERENCE_GENOME = hg19**
GENOME_SIZE = chrom_hg19.sizes
#######################################################################
##Allele specific
#######################################################################
ALLELE_SPECIFIC_SNP =
#######################################################################
##Digestion Hi-C
#######################################################################
### #this file is also provided for hg19 used in this tutorial
GENOME_FRAGMENT = HindIII_resfrag_hg19.bed
LIGATION_SITE = AAGCTAGCTT
MIN_FRAG_SIZE = 100
MAX_FRAG_SIZE = 100000
MIN_INSERT_SIZE = 100
MAX_INSERT_SIZE = 600
#######################################################################
##Hi-C processing
#######################################################################
MIN_CIS_DIST =
GET_ALL_INTERACTION_CLASSES = 1
GET_PROCESS_SAM = 1
RM_SINGLETON = 1
RM_MULTI = 1
RM_DUP = 1
#######################################################################
##Contact Maps
#######################################################################
### #bin sizes that define the resolution of the contact maps
BIN_SIZE = 500000 1000000
MATRIX_FORMAT = upper
#######################################################################
##ICE Normalization
#######################################################################
MAX_ITER = 100
FILTER_LOW_COUNT_PERC = 0.02
FILTER_HIGH_COUNT_PERC = 0
EPS = 0.1
HiC-Pro -i test_data -o out_put_test_data -c config_test_latest.txt
ls out_put_test_data/hic_results/
data matrix pic
ls out_put_test_data/hic_results/matrix
dixon_2M dixon_2M_2
ls out_put_test_data/hic_results/matrix/dixon_2M
iced raw
# Install the HiTC package from Bioconductor.
# biocLite() / biocLite.R has been retired by Bioconductor; BiocManager is the
# supported installer, so bootstrap it from CRAN when it is not available yet.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
BiocManager::install("HiTC")
# Load HiTC and show the two classes this script works with.
library(HiTC)
showClass("HTCexp")
showClass("HTClist")

# Root of the HiC-Pro output tree. Both input files are resolved from this
# single location so the matrix and its bin annotation always come from the
# same run (previously one path was absolute and the other relative).
# NOTE(review): the HiC-Pro command shown earlier in this tutorial writes to
# "out_put_test_data"; set hicpro_out to the directory you passed to -o.
hicpro_out <- file.path("..", "out_test_data")
matrix_dir <- file.path(hicpro_out, "hic_results", "matrix", "dixon_2M")

# Import the ICE-normalised 500 kb matrix together with the bed file that
# describes its bins.
hic <- importC(
  file.path(matrix_dir, "iced", "500000", "dixon_2M_500000_iced.matrix"),
  file.path(matrix_dir, "raw", "500000", "dixon_2M_500000_ord.bed")
)
hic
## Descriptive statistics
head(summary(hic))
CQC(hic, winsize = 1e+06, dev.new = FALSE, hist.dist = FALSE)

## Go back to a smaller dataset (chr5, 6, 7) at lower resolution
sset <- reduce(hic, chr = c("chr5", "chr6", "chr7"))
# mclapply lives in the base 'parallel' package, which is not guaranteed to be
# attached by library(HiTC); qualify it so the call does not depend on the
# search path.
hic90_500 <- HTClist(parallel::mclapply(
  sset, binningC,
  binsize = 500000, bin.adjust = FALSE, method = "sum", step = 1
))
mapC(hic90_500)

# Re-bin the chrX intra-chromosomal map (median, step = 3).
hic_x.binned <- binningC(
  hic$chrXchrX,
  binsize = 500000, method = "median", step = 3
)

## Look at expected counts on chrX
hicexp <- getExpectedCounts(
  hic_x.binned,
  method = "loess", stdev = TRUE, plot = TRUE
)
mapC(hic_x.binned, title = "chrX")
# Annotate graphics with gene features:
# Get the bed file from http://genome.ucsc.edu/cgi-bin/hgTables?hgsid=661039579_qIn3z1lzkaAypE9M4fryJAGV12wN&clade=mammal&org=&db=hg19&hgta_group=genes&hgta_track=knownGene&hgta_table=knownGene&hgta_regionType=genome&position=&hgta_outputType=primaryTable&hgta_outFileName=
# library() fails loudly when rtracklayer is missing; require() would only
# return FALSE and let the script break later at import().
library(rtracklayer)
gene <- import(file.path("..", "hg19.bed"), format = "bed")
# 'title' is spelled out in full instead of relying on partial matching of
# 'ti'.
mapC(
  hic_x.binned,
  tracks = list(RefSeqGene = gene),
  maxrange = 10,
  title = "chrX contacts"
)
# Focus on TADs: cut the 50-58 Mb window out of the chr6 intra-chromosomal
# map and plot it.
TAD <- extractRegion(
  hic$chr6chr6,
  chr = "chr6",
  from = 50e6,
  to = 58e6
)
tad_palette <- c("white", "orange", "red", "black")
plot(TAD, maxrange = 50, col.pos = tad_palette)
## Data Normalization by Expected number of Counts
# hiC14 was used below without ever being created in this script; take the
# chr14 intra-chromosomal map from the imported data and fail early if it is
# not available.
hiC14 <- hic$chr14chr14
stopifnot(inherits(hiC14, "HTCexp"))

hiC14norm <- normPerExpected(hiC14, method = "loess")
mapC(HTClist(hiC14norm), log.data = TRUE)

## Correlation Map of Chromosome 14
# Replace the normalised counts with their correlation matrix (sparseCor is
# an internal, non-exported HiTC helper, hence the ':::').
intdata(hiC14norm) <- HiTC:::sparseCor(intdata(hiC14norm))
mapC(
  HTClist(hiC14norm),
  maxrange = 1, minrange = -1,
  col.pos = c("black", "red"), col.neg = c("blue", "black")
)

# Principal Component Analysis
# empty due to low number of reads
pc <- pca.hic(hiC14, normPerExpected = TRUE, method = "loess", npc = 1)