[hg] galaxy 1552: Maf stats will now use bitset instead of numpy...

classic Classic list List threaded Threaded
1 message Options
Reply | Threaded
Open this post in threaded view
|

[hg] galaxy 1552: Maf stats will now use bitset instead of numpy...

Nate Coraor (nate@bx.psu.edu)
details:   http://www.bx.psu.edu/hg/galaxy/rev/4b9feffc3ce5
changeset: 1552:4b9feffc3ce5
user:      Dan Blankenberg <[hidden email]>
date:      Thu Oct 09 11:23:33 2008 -0400
description:
Maf stats will now use bitset instead of numpy.zerros.

3 file(s) affected in this change:

test-data/maf_stats_summary_out.dat
tools/maf/maf_stats.py
tools/maf/maf_stats.xml

diffs (85 lines):

diff -r fdbf15ea1f8a -r 4b9feffc3ce5 test-data/maf_stats_summary_out.dat
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/maf_stats_summary_out.dat Thu Oct 09 11:23:33 2008 -0400
@@ -0,0 +1,9 @@
+#species nucleotides coverage
+mm5 17186 0.9555
+panTro1 17732 0.9858
+danRer1 12263 0.6818
+canFam1 17034 0.9470
+fr1 11318 0.6292
+galGal2 13522 0.7518
+hg17 17987 1.0000
+rn3 16410 0.9123
diff -r fdbf15ea1f8a -r 4b9feffc3ce5 tools/maf/maf_stats.py
--- a/tools/maf/maf_stats.py Wed Oct 08 12:00:16 2008 -0400
+++ b/tools/maf/maf_stats.py Thu Oct 09 11:23:33 2008 -0400
@@ -8,7 +8,7 @@
 from galaxy import eggs
 import pkg_resources; pkg_resources.require( "bx-python" )
 import bx.intervals.io
-from numpy import zeros
+from bx.bitset import BitSet
 from galaxy.tools.util import maf_utilities
 
 assert sys.version_info[:2] >= ( 2, 4 )
@@ -56,32 +56,35 @@
     #loop through interval file
     for num_region, region in enumerate( bx.intervals.io.NiceReaderWrapper( open( input_interval_filename, 'r' ), chrom_col = chr_col, start_col = start_col, end_col = end_col, fix_strand = True, return_header = False, return_comments = False ) ):
         src = "%s.%s" % ( dbkey, region.chrom )
-        total_length += ( region.end - region.start )
-        coverage = { dbkey: zeros( region.end - region.start, dtype = bool ) }
+        region_length = region.end - region.start
+        total_length += region_length
+        coverage = { dbkey: BitSet( region_length ) }
         
         for block in maf_utilities.get_chopped_blocks_for_region( index, src, region, force_strand='+' ):
             #make sure all species are known
             for c in block.components:
                 spec = c.src.split( '.' )[0]
-                if spec not in coverage: coverage[spec] = zeros( region.end - region.start, dtype = bool )
+                if spec not in coverage: coverage[spec] = BitSet( region_length )
             start_offset, alignment = maf_utilities.reduce_block_by_primary_genome( block, dbkey, region.chrom, region.start )
             for i in range( len( alignment[dbkey] ) ):
                 for spec, text in alignment.items():
                     if text[i] != '-':
-                        coverage[spec][start_offset + i] = True
+                        coverage[spec].set( start_offset + i )
         if summary:
             #record summary
             for key in coverage.keys():
                 if key not in species_summary: species_summary[key] = 0
-                species_summary[key] = species_summary[key] + sum( coverage[key] )
+                species_summary[key] = species_summary[key] + coverage[key].count_range()
         else:
             #print coverage for interval
-            out.write( "%s\t%s\t%s\t%s\n" % ( "\t".join( region.fields ), dbkey, sum( coverage[dbkey] ), len(coverage[dbkey]) - sum( coverage[dbkey] ) ) )
+            coverage_sum = coverage[dbkey].count_range()
+            out.write( "%s\t%s\t%s\t%s\n" % ( "\t".join( region.fields ), dbkey, coverage_sum, region_length - coverage_sum ) )
             keys = coverage.keys()
             keys.remove( dbkey )
             keys.sort()
             for key in keys:
-                out.write( "%s\t%s\t%s\t%s\n" % ( "\t".join( region.fields ), key, sum( coverage[key] ), len(coverage[key]) - sum( coverage[key] ) ) )
+                coverage_sum = coverage[key].count_range()
+                out.write( "%s\t%s\t%s\t%s\n" % ( "\t".join( region.fields ), key, coverage_sum, region_length - coverage_sum ) )
     if summary:
         out.write( "#species\tnucleotides\tcoverage\n" )
         for spec in species_summary:
diff -r fdbf15ea1f8a -r 4b9feffc3ce5 tools/maf/maf_stats.xml
--- a/tools/maf/maf_stats.xml Wed Oct 08 12:00:16 2008 -0400
+++ b/tools/maf/maf_stats.xml Thu Oct 09 11:23:33 2008 -0400
@@ -60,6 +60,13 @@
       <param name="mafType" value="8_WAY_MULTIZ_hg17"/>
       <output name="out_file1" file="maf_stats_interval_out.dat"/>
       <param name="summary" value="false"/>
+    </test>
+    <test>
+      <param name="input1" value="1.bed" dbkey="hg17" format="bed"/>
+      <param name="maf_source" value="cached"/>
+      <param name="mafType" value="8_WAY_MULTIZ_hg17"/>
+      <output name="out_file1" file="maf_stats_summary_out.dat"/>
+      <param name="summary" value="true"/>
     </test>
   </tests>
   <help>