This track shows the liftOver of ncRNA transcripts from GENCODEv32 to the Repeat Browser. "Non-coding transcript" in this case means that the transcript defined by GENCODE had a thickStart and thickEnd that were equal in the knownGene track. At the end of this process, we compared the biotypes of the resulting genes and confirmed that all genes in this track (and therefore all blocks in this track) are entirely non-coding. Note that the enst and coords fields are truncated for a few records (to fit within string character limits).
Use this track with GENCODEv32 UTRs (non-coding portions of coding transcripts) & GENCODEv32 CDS (coding regions of exons).
The slightly convoluted approach here is the result of a small bug in liftOver and pslMap that messes up thickStart/thickEnd. Also, lifting the bed12 directly instead of first converting it to exons would result in significantly fewer matches, since failure to lift any one of a transcript's exons results in "partially deleted in new" output in the unmapped file.
#Split the GENCODE BED12 transcripts into three exon-level sets: ncRNA exons, coding (CDS) portions
#of exons, and UTR portions of coding transcripts.
#if thickStart = thickEnd, there is no coding. This may seem to be messy but comparison of ENST records with biotypes from GENCODE gff at end confirms this is true.
awk '$7 == $8' knownGene32_12.bed > ncRNA_knownGene32_temp.bed
#coding transcripts are the exact complement of the filter above (thickStart != thickEnd).
#Selecting them directly avoids the diff-based "uniqueToFileOne.sh ... | tail -n+2" helper, which
#yields the same rows but depends on an external script and on stripping the diff header line.
awk '$7 != $8' knownGene32_12.bed > cds_knownGene32_12.bed
#make exons from bed12
bedToExons ncRNA_knownGene32_temp.bed ncRNA_knownGene32_exons.bed
bedToExons -cdsOnly cds_knownGene32_12.bed temp.bed
#all exons of the coding transcripts only (no ncRNA). An earlier bedToExons run over the full
#knownGene32_12.bed wrote to this same file and was overwritten here, so it is omitted.
bedToExons cds_knownGene32_12.bed knownGene32_exons.bed
#thickStart=thickEnd is detected as "coding" by bedToExons, we don't want those in our list of "coding" exons they are ncRNA
#(such records come out of -cdsOnly with start == end, so they are dropped here)
awk '$2 != $3' temp.bed > cds_knownGene32_exons.bed
#subtract the coding portions of exons from all exons (no ncRNA) to get the UTRs of coding transcripts
#NOTE(review): no -s is given, so the subtraction is by coordinate only, not per strand or per transcript - confirm this is intended.
bedtools subtract -a knownGene32_exons.bed -b cds_knownGene32_exons.bed > utr_knownGene32_exons.bed
#lift all 3 exon files (ncRNA, CDS, UTR) through the same hg38 -> hg38reps chain.
#Each run writes the lifted records plus a matching .unmapped file for records that failed to lift.
#Note the ncRNA output is named *_12.bed even though its input is the exon-level file.
chain=../lift/hg38_to_hg38reps.over.chain
liftOver ncRNA_knownGene32_exons.bed "$chain" \
  RepBro_ncRNA_knownGene32_12.bed RepBro_ncRNA_knownGene32_12.unmapped \
  -multiple
liftOver cds_knownGene32_exons.bed "$chain" \
  RepBro_cds_knownGene32_exons.bed RepBro_cds_knownGene32_exons.unmapped \
  -multiple
liftOver utr_knownGene32_exons.bed "$chain" \
  RepBro_utr_knownGene32_exons.bed RepBro_utr_knownGene32_exons.unmapped \
  -multiple
#fix thickStart and thickEnd on the lifted files: coding exons are all thick (thickStart=start,
#thickEnd=end), non-coding exons are all not thick (thickStart=thickEnd=start). Exons that had
#both portions were split in two previously. Column 9 is set to 0 in every record.
#NOTE(review): none of the later commands shown in this file read t1.bed, t2.bed or t3.bed; the
#join step works from the RepBro_*.bed files and sets the thick columns itself - confirm whether
#these intermediate files are still needed.
awk 'BEGIN { OFS = "\t" } { $7 = $2; $8 = $2; $9 = 0; print }' RepBro_ncRNA_knownGene32_12.bed > t1.bed
awk 'BEGIN { OFS = "\t" } { $7 = $2; $8 = $3; $9 = 0; print }' RepBro_cds_knownGene32_exons.bed > t2.bed
awk 'BEGIN { OFS = "\t" } { $7 = $2; $8 = $2; $9 = 0; print }' RepBro_utr_knownGene32_exons.bed > t3.bed
#add additional fields
#Each pipeline joins the lifted exons (key: BED column 4) to columns 1,2,7 of knownGene32_keyfile.txt
#(key: column 1), then rebuilds a 10-column record: chrom, start, end, keyfile column 7 as the name,
#score 0, strand, thickStart, thickEnd, the join key, keyfile column 2.
#(field positions in the awk step assume the lifted files are 6-column BED - TODO confirm)
#thickStart/thickEnd are start,start for ncRNA and UTR (not thick) and start,end for CDS (all thick).
#join needs both inputs ordered on the join field alone, so the sort keys are restricted to that
#field (-k4b,4 / -k1b,1) rather than running from it to the end of the line.
#sort | uniq drops duplicate records and bedtools sort restores chrom/start order.
join -1 4 -2 1 \
  <(sort -k4b,4 RepBro_ncRNA_knownGene32_12.bed) \
  <(sort -k1b,1 knownGene32_keyfile.txt | cut -f 1,2,7) \
  | awk -v OFS="\t" '{print $2,$3,$4,$8,"0", $6,$3,$3,$1,$7}' \
  | sort | uniq | bedtools sort > RepBro_ncRNA_knownGene32_14.bed
join -1 4 -2 1 \
  <(sort -k4b,4 RepBro_cds_knownGene32_exons.bed) \
  <(sort -k1b,1 knownGene32_keyfile.txt | cut -f 1,2,7) \
  | awk -v OFS="\t" '{print $2,$3,$4,$8,"0", $6,$3,$4,$1,$7}' \
  | sort | uniq | bedtools sort > RepBro_cds_knownGene32_exons_14.bed
join -1 4 -2 1 \
  <(sort -k4b,4 RepBro_utr_knownGene32_exons.bed) \
  <(sort -k1b,1 knownGene32_keyfile.txt | cut -f 1,2,7) \
  | awk -v OFS="\t" '{print $2,$3,$4,$8,"0", $6,$3,$3,$1,$7}' \
  | sort | uniq | bedtools sort > RepBro_utr_knownGene32_exons_14.bed
#grouping by the name of the gene lets us collapse isoforms of the same gene that have the same genomic portion mapping to the exact same consensus position into one display record.
#Columns 1-8 form the group key; columns 9 and 10 are collapsed into comma-separated lists.
#bedtools groupby only merges adjacent rows, so the input has to be sorted on every grouping
#column. The *_14.bed files come out of "bedtools sort" (chrom, then start only), so they are
#re-sorted on all 8 key columns first. chrom and start stay the leading keys, which keeps the
#output in the order bedToBigBed expects. groupby reads stdin when -i is omitted.
LC_ALL=C sort -t $'\t' -k1,1 -k2,2n -k3,3n -k4,4 -k5,5n -k6,6 -k7,7n -k8,8n RepBro_cds_knownGene32_exons_14.bed \
  | bedtools groupby -g 1,2,3,4,5,6,7,8 -c 9,10 -o collapse > collapse_RepBro_cds_knownGene32_exons_14.bed
LC_ALL=C sort -t $'\t' -k1,1 -k2,2n -k3,3n -k4,4 -k5,5n -k6,6 -k7,7n -k8,8n RepBro_ncRNA_knownGene32_14.bed \
  | bedtools groupby -g 1,2,3,4,5,6,7,8 -c 9,10 -o collapse > collapse_RepBro_ncRNA_knownGene32_14.bed
LC_ALL=C sort -t $'\t' -k1,1 -k2,2n -k3,3n -k4,4 -k5,5n -k6,6 -k7,7n -k8,8n RepBro_utr_knownGene32_exons_14.bed \
  | bedtools groupby -g 1,2,3,4,5,6,7,8 -c 9,10 -o collapse > collapse_RepBro_utr_knownGene32_exons_14.bed
#convert each collapsed BED (transcripts with the same name and RepeatBrowser coordinate already
#combined to make output easier to read) into a bigBed, using the hg38reps sizes file.
#-type=bed8+2 declares 8 standard BED columns followed by 2 extra columns.
sizes=../hg38reps/hg38reps.sizes
bedToBigBed collapse_RepBro_cds_knownGene32_exons_14.bed "$sizes" \
  gencode_cds.bb -type=bed8+2
bedToBigBed collapse_RepBro_ncRNA_knownGene32_14.bed "$sizes" \
  gencode_ncRNA.bb -type=bed8+2
bedToBigBed collapse_RepBro_utr_knownGene32_exons_14.bed "$sizes" \
  gencode_utr.bb -type=bed8+2
Harrow J, Frankish A, Gonzalez JM, Tapanari E, Diekhans M, Kokocinski F, Aken BL, Barrell D, Zadissa A, Searle S et al. GENCODE: the reference human genome annotation for The ENCODE Project. Genome Res. 2012 Sep;22(9):1760-74. PMID: 22955987; PMC: PMC3431492
Harrow J, Denoeud F, Frankish A, Reymond A, Chen CK, Chrast J, Lagarde J, Gilbert JG, Storey R, Swarbreck D et al. GENCODE: producing a reference annotation for ENCODE. Genome Biol. 2006;7 Suppl 1:S4.1-9. PMID: 16925838; PMC: PMC1810553
A full list of GENCODE publications is available at The GENCODE Project web site.
Email: markd@ucsc.edu or mhaeussl@ucsc.edu