[hg] galaxy 1514: New tool to format short read mapping data as ...

classic Classic list List threaded Threaded
1 message Options
| Threaded
Open this post in threaded view
|

[hg] galaxy 1514: New tool to format short read mapping data as ...

greg
details:   http://www.bx.psu.edu/hg/galaxy/rev/33e06a98b6d8
changeset: 1514:33e06a98b6d8
user:      guru
date:      Wed Sep 17 16:42:08 2008 -0400
description:
New tool to format short read mapping data as a UCSC custom track,.

2 file(s) affected in this change:

tools/metag_tools/mapping_to_ucsc.py
tools/metag_tools/mapping_to_ucsc.xml

diffs (415 lines):

diff -r cf17b5a16eff -r 33e06a98b6d8 tools/metag_tools/mapping_to_ucsc.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/metag_tools/mapping_to_ucsc.py Wed Sep 17 16:42:08 2008 -0400
@@ -0,0 +1,204 @@
+#! /usr/bin/python
+
+from galaxy import eggs
+import sys, tempfile, os
+
+assert sys.version_info[:2] >= (2.4)
+
+def stop_err(msg):
+    sys.stderr.write(msg)
+    sys.exit()
+    
+def main():
+
+    out_fname = sys.argv[1]
+    in_fname = sys.argv[2]
+    chr_col = int(sys.argv[3])-1
+    coord_col = int(sys.argv[4])-1
+    track_type = sys.argv[5]
+    if track_type == 'coverage' or track_type == 'both':
+        coverage_col = int(sys.argv[6])-1
+        cname = sys.argv[7]
+        cdescription = sys.argv[8]
+        ccolor = sys.argv[9].replace('-',',')
+        cvisibility = sys.argv[10]
+    if track_type == 'snp' or track_type == 'both':
+        if track_type == 'both':
+            j = 5
+        else:
+            j = 0
+        #sname = sys.argv[7+j]
+        sdescription = sys.argv[6+j]
+        svisibility = sys.argv[7+j]
+        #ref_col = int(sys.argv[10+j])-1
+        read_col = int(sys.argv[8+j])-1
+    
+
+    # Sort the input file based on chromosome (alphabetically) and start co-ordinates (numerically)
+    sorted_infile = tempfile.NamedTemporaryFile()
+    try:
+        os.system("sort -k %d,%d -k %dn -o %s %s" %(chr_col+1,chr_col+1,coord_col+1,sorted_infile.name,in_fname))
+    except Exception, exc:
+        stop_err( 'Initialization error -> %s' %str(exc) )
+
+    #generate chr list
+    sorted_infile.seek(0)
+    chr_vals = []
+    for line in file( sorted_infile.name ):
+        line = line.strip()
+        if not(line):
+            continue
+        try:
+            fields = line.split('\t')
+            chr = fields[chr_col]
+            if chr not in chr_vals:
+                chr_vals.append(chr)
+        except:
+            pass
+    if not(chr_vals):  
+        stop_err("Skipped all lines as invalid.")
+        
+    if track_type == 'coverage' or track_type == 'both':
+        if track_type == 'coverage':
+            fout = open( out_fname, "w" )
+        else:
+            fout = tempfile.NamedTemporaryFile()
+        fout.write('''track type=wiggle_0 name="%s" description="%s" color=%s visibility=%s\n''' \
+                      % ( cname, cdescription, ccolor, cvisibility ))
+    if track_type == 'snp' or track_type == 'both':
+        fout_a = tempfile.NamedTemporaryFile()
+        fout_t = tempfile.NamedTemporaryFile()
+        fout_g = tempfile.NamedTemporaryFile()
+        fout_c = tempfile.NamedTemporaryFile()
+        fout_ref = tempfile.NamedTemporaryFile()
+        
+        fout_a.write('''track type=wiggle_0 name="%s" description="%s" color=%s visibility=%s\n''' \
+                      % ( "Track A", sdescription, '255,0,0', svisibility ))
+        fout_t.write('''track type=wiggle_0 name="%s" description="%s" color=%s visibility=%s\n''' \
+                      % ( "Track T", sdescription, '0,255,0', svisibility ))
+        fout_g.write('''track type=wiggle_0 name="%s" description="%s" color=%s visibility=%s\n''' \
+                      % ( "Track G", sdescription, '0,0,255', svisibility ))
+        fout_c.write('''track type=wiggle_0 name="%s" description="%s" color=%s visibility=%s\n''' \
+                      % ( "Track C", sdescription, '255,0,255', svisibility ))
+        
+        
+    sorted_infile.seek(0)
+    for line in file( sorted_infile.name ):
+        line = line.strip()
+        if not(line):
+            continue
+        try:
+            fields = line.split('\t')
+            chr = fields[chr_col]
+            start = int(fields[coord_col])
+            assert start > 0
+        except:
+            continue
+        try:
+            ind = chr_vals.index(chr)    #encountered chr for the 1st time
+            del chr_vals[ind]
+            prev_start = ''
+            header = "variableStep chrom=%s\n" %(chr)
+            if track_type == 'coverage' or track_type == 'both':
+                coverage = int(fields[coverage_col])
+                line1 = "%s\t%s\n" %(start,coverage)
+                fout.write("%s%s" %(header,line1))
+            if track_type == 'snp' or track_type == 'both':
+                a = t = g = c = 0
+                fout_a.write("%s" %(header))
+                fout_t.write("%s" %(header))
+                fout_g.write("%s" %(header))
+                fout_c.write("%s" %(header))
+                try:
+                    #ref_nt = fields[ref_col].capitalize()
+                    read_nt = fields[read_col].capitalize()
+                    try:
+                        nt_ind = ['A','T','G','C'].index(read_nt)
+                        if nt_ind == 0:
+                            a+=1
+                        elif nt_ind == 1:
+                            t+=1
+                        elif nt_ind == 2:
+                            g+=1
+                        else:
+                            c+=1
+                    except ValueError:
+                        pass
+                except:
+                    pass
+            prev_start = start
+        except ValueError:
+            if start != prev_start:
+                if track_type == 'coverage' or track_type == 'both':
+                    coverage = int(fields[coverage_col])
+                    fout.write("%s\t%s\n" %(start,coverage))
+                if track_type == 'snp' or track_type == 'both':
+                    if a:
+                        fout_a.write("%s\t%s\n" %(prev_start,a))
+                    if t:
+                        fout_t.write("%s\t%s\n" %(prev_start,t))
+                    if g:
+                        fout_g.write("%s\t%s\n" %(prev_start,g))
+                    if c:
+                        fout_c.write("%s\t%s\n" %(prev_start,c))
+                    a = t = g = c = 0
+                    try:
+                        #ref_nt = fields[ref_col].capitalize()
+                        read_nt = fields[read_col].capitalize()
+                        try:
+                            nt_ind = ['A','T','G','C'].index(read_nt)
+                            if nt_ind == 0:
+                                a+=1
+                            elif nt_ind == 1:
+                                t+=1
+                            elif nt_ind == 2:
+                                g+=1
+                            else:
+                                c+=1
+                        except ValueError:
+                            pass
+                    except:
+                        pass
+                prev_start = start
+            else:
+                if track_type == 'snp' or track_type == 'both':
+                    try:
+                        #ref_nt = fields[ref_col].capitalize()
+                        read_nt = fields[read_col].capitalize()
+                        try:
+                            nt_ind = ['A','T','G','C'].index(read_nt)
+                            if nt_ind == 0:
+                                a+=1
+                            elif nt_ind == 1:
+                                t+=1
+                            elif nt_ind == 2:
+                                g+=1
+                            else:
+                                c+=1
+                        except ValueError:
+                            pass
+                    except:
+                        pass
+    
+    if track_type == 'snp' or track_type == 'both':
+        if a:
+            fout_a.write("%s\t%s\n" %(prev_start,a))
+        if t:
+            fout_t.write("%s\t%s\n" %(prev_start,t))
+        if g:
+            fout_g.write("%s\t%s\n" %(prev_start,g))
+        if c:
+            fout_c.write("%s\t%s\n" %(prev_start,c))
+            
+        fout_a.seek(0)
+        fout_g.seek(0)
+        fout_t.seek(0)
+        fout_c.seek(0)    
+    
+    if track_type == 'snp':
+        os.system("cat %s %s %s %s >> %s" %(fout_a.name,fout_t.name,fout_g.name,fout_c.name,out_fname))
+    elif track_type == 'both':
+        fout.seek(0)
+        os.system("cat %s %s %s %s %s | cat > %s" %(fout.name,fout_a.name,fout_t.name,fout_g.name,fout_c.name,out_fname))
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff -r cf17b5a16eff -r 33e06a98b6d8 tools/metag_tools/mapping_to_ucsc.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/metag_tools/mapping_to_ucsc.xml Wed Sep 17 16:42:08 2008 -0400
@@ -0,0 +1,202 @@
+<tool id="mapToUCSC" name="Format mapping data" version="1.0.0">
+  <description> as UCSC custom track</description>
+  <command interpreter="python">
+   mapping_to_ucsc.py
+   $out_file1
+   $input
+   $chr_col
+   $coord_col
+   $track.track_type
+   #if $track.track_type == "coverage" or $track.track_type == "both"
+   $track.coverage_col
+    "${track.cname}"
+    "${track.cdescription}"
+    "${track.ccolor}"
+    "${track.cvisibility}"
+    #end if
+    #if $track.track_type == "snp" or $track.track_type == "both"
+    "${track.sdescription}"
+    "${track.svisibility}"
+     $track.col2
+    #end if
+  </command>
+  <inputs>
+    <param format="tabular" name="input" type="data" label="Select mapping data"/>
+    <param name="chr_col" type="data_column" data_ref="input" label="Column for reference chromosome" />
+    <param name="coord_col" type="data_column" data_ref="input" numerical="True" label="Numerical column for reference co-ordinate" />
+    <conditional name="track">
+      <param name="track_type" type="select" label="Display">
+     <option value="snp" selected="true">SNPs</option>
+        <option value="coverage">Read coverage</option>
+     <option value="both">Both</option>
+      </param>
+      <when value = "coverage">
+      <param name="coverage_col" type="data_column" data_ref="input" numerical="True" label="Numerical column for read coverage" />
+      <param name="cname" type="text" size="15" value="User Track" label="Coverage track name">
+        <validator type="length" max="15"/>
+      </param>
+      <param name="cdescription" type="text" value="User Supplied Coverage Track (from Galaxy)" label="Coverage track description">
+        <validator type="length" max="60" size="15"/>
+      </param>
+      <param label="Coverage track Color" name="ccolor" type="select">
+            <option selected="yes" value="0-0-0">Black</option>
+            <option value="255-0-0">Red</option>
+            <option value="0-255-0">Green</option>
+            <option value="0-0-255">Blue</option>
+            <option value="255-0-255">Magenta</option>
+            <option value="0-255-255">Cyan</option>
+            <option value="255-215-0">Gold</option>
+            <option value="160-32-240">Purple</option>
+            <option value="255-140-0">Orange</option>
+            <option value="255-20-147">Pink</option>
+            <option value="92-51-23">Dark Chocolate</option>
+            <option value="85-107-47">Olive green</option>
+      </param>
+      <param label="Coverage track Visibility" name="cvisibility" type="select">
+            <option selected="yes" value="1">Dense</option>
+            <option value="2">Full</option>
+            <option value="3">Pack</option>
+            <option value="4">Squish</option>
+            <option value="0">Hide</option>
+      </param>
+      </when>
+      
+      <when value = "snp">
+      <!--
+      <param name="col1" type="data_column" data_ref="input" label="Column containing the reference nucleotide" />
+       -->
+      <param name="col2" type="data_column" data_ref="input" label="Column containing the read nucleotide" />
+      <!--
+      <param name="sname" type="text" size="15" value="User Track-2" label="SNP track name">
+        <validator type="length" max="15"/>
+      </param>
+       -->
+      <param name="sdescription" type="text" value="User Supplied Track (from Galaxy)" label="SNP track description">
+        <validator type="length" max="60" size="15"/>
+      </param>
+      <param label="SNP track Visibility" name="svisibility" type="select">
+            <option selected="yes" value="1">Dense</option>
+            <option value="2">Full</option>
+            <option value="3">Pack</option>
+            <option value="4">Squish</option>
+            <option value="0">Hide</option>
+      </param>
+      </when>
+      
+      <when value = "both">
+      <param name="coverage_col" type="data_column" data_ref="input" numerical="True" label="Numerical column for read coverage" />
+      <param name="cname" type="text" size="15" value="User Track" label="Coverage track name">
+        <validator type="length" max="15"/>
+      </param>
+      <param name="cdescription" type="text" size="15" value="User Supplied Track (from Galaxy)" label="Coverage track description">
+        <validator type="length" max="60"/>
+      </param>
+      <param label="Coverage track Color" name="ccolor" type="select">
+            <option selected="yes" value="0-0-0">Black</option>
+            <option value="255-0-0">Red</option>
+            <option value="0-255-0">Green</option>
+            <option value="0-0-255">Blue</option>
+            <option value="255-0-255">Magenta</option>
+            <option value="0-255-255">Cyan</option>
+            <option value="255-215-0">Gold</option>
+            <option value="160-32-240">Purple</option>
+            <option value="255-140-0">Orange</option>
+            <option value="255-20-147">Pink</option>
+            <option value="92-51-23">Dark Chocolate</option>
+            <option value="85-107-47">Olive green</option>
+      </param>
+      <param label="Coverage track Visibility" name="cvisibility" type="select">
+            <option selected="yes" value="1">Dense</option>
+            <option value="2">Full</option>
+            <option value="3">Pack</option>
+            <option value="4">Squish</option>
+            <option value="0">Hide</option>
+      </param>
+      <!--
+      <param name="col1" type="data_column" data_ref="input" label="Column containing the reference nucleotide" />
+       -->
+      <param name="col2" type="data_column" data_ref="input" label="Column containing the read nucleotide" />
+      <!--
+      <param name="sname" type="text" size="15" value="User Track-2" label="SNP track name">
+        <validator type="length" max="15"/>
+      </param>
+       -->
+      <param name="sdescription" type="text" size="15" value="User Supplied Track (from Galaxy)" label="SNP track description">
+        <validator type="length" max="60"/>
+      </param>
+      <param label="SNP track Visibility" name="svisibility" type="select">
+            <option selected="yes" value="1">Dense</option>
+            <option value="2">Full</option>
+            <option value="3">Pack</option>
+            <option value="4">Squish</option>
+            <option value="0">Hide</option>
+      </param>
+      </when>
+    </conditional>
+  </inputs>
+  <outputs>
+    <data format="customtrack" name="out_file1"/>
+  </outputs>
+
+  
+ <help>
+
+.. class:: infomark
+
+**What it does**
+
+This tool formats mapping data generated by short read mappers, as a custom track that can be displayed at UCSC genome browser.
+
+-----
+
+.. class:: warningmark
+
+**Note**
+
+This tool requires the mapping data to contain at least the following information:
+
+chromosome, genome coordinate, read nucleotide (if option to display is SNPs), read coverage (if option to display is Read coverage).
+
+-----
+
+**Example**
+
+For the following Mapping data::
+
+   #chr g_start read_id          read_coord g_nt read_nt qual read_coverage
+   chrM    1   1:29:1672:1127/1    11        G    G       40  134
+   chrM    1   1:32:93:933/1       4         G    A       40  134
+   chrM    1   1:34:116:2032/1     11        G    A       40  134
+   chrM    1   1:39:207:964/1      1         G    G       40  134
+   chrM    2   1:3:359:848/1       1         G    C       40  234
+   chrM    2   1:40:1435:1013/1    1         G    G       40  234
+   chrM    3   1:40:730:972/1      9         G    G       40  334
+   chrM    4   1:42:1712:921/2     31        G    T       35  434
+   chrM    4   1:44:1649:493/1     4         G    G       40  434
+
+running this tool to display both SNPs and Read coverage will return the following tracks, containing aggregated data per genome co-ordinate::
+
+   track type=wiggle_0 name="Coverage Track" description="User Supplied Track (from Galaxy)" color=0,0,0 visibility=1
+   variableStep chrom=chrM
+   1   134
+   2   234
+   3   334
+   4   434
+   track type=wiggle_0 name="Track A" description="User Supplied SNP Track (from Galaxy)" color=255,0,0 visibility=1
+   variableStep chrom=chrM
+   1   2
+   track type=wiggle_0 name="Track T" description="User Supplied SNP Track (from Galaxy)" color=0,255,0 visibility=1
+   variableStep chrom=chrM
+   4   1
+   track type=wiggle_0 name="Track G" description="User Supplied SNP Track (from Galaxy)" color=0,0,255 visibility=1
+   variableStep chrom=chrM
+   1   2
+   2   1
+   3   1
+   4   1
+   track type=wiggle_0 name="Track C" description="User Supplied SNP Track (from Galaxy)" color=255,0,255 visibility=1
+   variableStep chrom=chrM
+   2   1
+  
+  </help>  
+</tool>