This track shows the liftOver of coding sequence exons from GENCODEv32 to the Repeat Browser. "Coding sequence" in this case means that the exon defined by GENCODE had a defined thickStart and thickEnd in the knownGene track. If the exon contained both coding and non-coding regions (e.g. an exon with the 5'UTR and the start codon) the exon was split into two with the UTR visualized in the GENCODEv32 UTRs track. Therefore all blocks in this track are entirely coding. Note the enst and coords fields are truncated for a few records (to fit within string character limits).
Use this track with GENCODEv32 ncRNA (no thickStart/thickEnd, transcript product is non-coding) & GENCODEv32 UTRs (exons are part of a coding transcript but these regions do not code).
The slightly convoluted approach here is the result of a small bug in liftOver and pslMap that messes up thickStart/thickEnd. Also, lifting the bed12 directly instead of first converting it to exons would result in significantly fewer matches, since failure to lift multiple exons will result in "partially deleted in new" output in the unmapped file.
bedToExons knownGene32_12.bed knownGene32_exons.bed
#if thickstart = thickend, there is no coding. This may seem to be messy but comparison of ENST records with biotypes from GENCODE gff at end confirms this is true.
awk '$7 == $8' knownGene32_12.bed > ncRNA_knownGene32_temp.bed
#uniqueToFileOne is a short one liner: diff -U $(wc -l < $1) $1 $2 | sed -n 's/^-//p' that returns the contents of file1 that are not in file2 knownGene32_12.bed ncRNA_knownGene32_temp.bed | tail -n+2 > cds_knownGene32_12.bed
#make exons from bed12
bedToExons ncRNA_knownGene32_temp.bed ncRNA_knownGene32_exons.bed
bedToExons -cdsOnly cds_knownGene32_12.bed temp.bed
bedToExons cds_knownGene32_12.bed knownGene32_exons.bed
#thickStart=thickEnd is detected as "coding" by bedToExons, we don't want those in our list of "coding" exons they are ncRNA
awk '$2 != $3' temp.bed > cds_knownGene32_exons.bed
#subtract the coding portions of exons from all exons (no ncRNA) to get the UTRs of coding transcripts
bedtools subtract -a knownGene32_exons.bed -b cds_knownGene32_exons.bed > utr_knownGene32_exons.bed
#lift all 3 exon sets (ncRNA, coding, UTR) through the hg38 -> hg38reps chain
# lift_to_repbro EXONS_BED OUT_STEM
# Runs liftOver on EXONS_BED, writing lifted records to OUT_STEM.bed and the
# records that failed to lift to OUT_STEM.unmapped.
lift_to_repbro() {
  local exons=$1
  local stem=$2
  liftOver "$exons" ../lift/hg38_to_hg38reps.over.chain "${stem}.bed" "${stem}.unmapped" -multiple
}

lift_to_repbro ncRNA_knownGene32_exons.bed RepBro_ncRNA_knownGene32_12
lift_to_repbro cds_knownGene32_exons.bed RepBro_cds_knownGene32_exons
lift_to_repbro utr_knownGene32_exons.bed RepBro_utr_knownGene32_exons
#reset thickStart/thickEnd after the lift: coding exons become entirely thick,
#non-coding exons entirely thin (exons containing both were split in two earlier).
# mark_thick LIFTED_BED OUT_BED END_COL
# Sets column 7 (thickStart) to the block start, column 8 (thickEnd) to the
# value of column END_COL, and column 9 to 0, writing tab-separated output.
# END_COL=2 gives an empty thick region; END_COL=3 makes the whole block thick.
# NOTE(review): no later step visible in this file reads t1.bed/t2.bed/t3.bed;
# the "add additional fields" step sets thickStart/thickEnd itself - confirm
# whether these outputs are still needed.
mark_thick() {
  awk -v OFS="\t" -v end_col="$3" '{ $7 = $2; $8 = $end_col; $9 = 0; print }' "$1" > "$2"
}

mark_thick RepBro_ncRNA_knownGene32_12.bed t1.bed 2
mark_thick RepBro_cds_knownGene32_exons.bed t2.bed 3
mark_thick RepBro_utr_knownGene32_exons.bed t3.bed 2
#add additional fields
# join_keyfile_fields LIFTED_BED THICK_END_COL [KEYFILE]
# Joins LIFTED_BED (column 4 = transcript id) with columns 1, 2 and 7 of
# KEYFILE (column 1 = transcript id; default knownGene32_keyfile.txt) and
# prints one tab-separated 10-column record per match:
#   chrom, start, end, keyfile col 7, 0, strand, thickStart, thickEnd,
#   transcript id, keyfile col 2
# thickStart is always the block start. thickEnd is the joined-row field
# THICK_END_COL: 3 (= start) for non-coding blocks, 4 (= end) for coding ones.
# NOTE(review): keyfile col 7 is presumably the gene name, since the records
# are later grouped "by the name of the gene" - confirm against the keyfile.
#
# Both inputs are sorted on the join field ONLY (-kN b,N). A bare `sort -k4`
# keys on "field 4 through end of line", which can order rows differently
# from what join expects and make it skip records.
join_keyfile_fields() {
  local lifted=$1
  local thick_end_col=$2
  local keyfile=${3:-knownGene32_keyfile.txt}
  join -1 4 -2 1 \
    <(sort -k4b,4 "$lifted") \
    <(sort -k1b,1 "$keyfile" | cut -f 1,2,7) \
    | awk -v OFS="\t" -v te="$thick_end_col" \
        '{print $2,$3,$4,$8,"0", $6,$3,$te,$1,$7}'
}

join_keyfile_fields RepBro_ncRNA_knownGene32_12.bed 3 | sort | uniq | bedtools sort > RepBro_ncRNA_knownGene32_14.bed
join_keyfile_fields RepBro_cds_knownGene32_exons.bed 4 | sort | uniq | bedtools sort > RepBro_cds_knownGene32_exons_14.bed
join_keyfile_fields RepBro_utr_knownGene32_exons.bed 3 | sort | uniq | bedtools sort > RepBro_utr_knownGene32_exons_14.bed
#grouping by the name of the gene lets us collapse isoforms of the same gene that have the same genomic portion mapping to the exact same consensus position into one display record.
# sort_by_group_columns BED
# Prints BED ordered on columns 1-8 (the grouping columns). bedtools groupby
# only merges CONSECUTIVE rows with equal keys, so rows that share all eight
# columns must be adjacent. -s keeps the input order inside each group, and
# the chrom/start ordering needed by bedToBigBed is preserved.
sort_by_group_columns() {
  LC_ALL=C sort -s -t $'\t' \
    -k1,1 -k2,2n -k3,3n -k4,4 -k5,5 -k6,6 -k7,7n -k8,8n "$1"
}

# collapse_isoforms IN_BED OUT_BED
# Merges rows that are identical in columns 1-8 into one row whose columns 9
# and 10 are the comma-separated lists of the merged rows' values.
collapse_isoforms() {
  # groupby reads standard input when -i is omitted.
  sort_by_group_columns "$1" \
    | bedtools groupby -g 1,2,3,4,5,6,7,8 -c 9,10 -o collapse > "$2"
}

collapse_isoforms RepBro_cds_knownGene32_exons_14.bed collapse_RepBro_cds_knownGene32_exons_14.bed
collapse_isoforms RepBro_ncRNA_knownGene32_14.bed collapse_RepBro_ncRNA_knownGene32_14.bed
collapse_isoforms RepBro_utr_knownGene32_exons_14.bed collapse_RepBro_utr_knownGene32_exons_14.bed
#convert each collapsed BED (8 standard columns + 2 extra) to bigBed.
#bedToBigBed takes three positional arguments: in.bed chrom.sizes out.bb.
#The output is written next to the input with the .bed suffix replaced by .bb.
for collapsed_bed in \
  collapse_RepBro_cds_knownGene32_exons_14.bed \
  collapse_RepBro_ncRNA_knownGene32_14.bed \
  collapse_RepBro_utr_knownGene32_exons_14.bed
do
  bedToBigBed "$collapsed_bed" ../hg38reps/hg38reps.sizes "${collapsed_bed%.bed}.bb" -type=bed8+2
done
