Source code for ICGC_data_parser.ssm_reader

"""
Module with functionality to read the ICGC
simple somatic mutations file.
"""

import vcf
import re
from collections import namedtuple


class BufferedReader:
    """A wrapper over a file descriptor that adds buffering functionality."""
    def __init__(self, fdesc):
        self.file = fdesc
        self.buffer = []
    # ---
    
    def __getattr__(self, attr):
        return getattr(self.file, attr)
    # ---
    
    def push(self, line):
        self.buffer.append(line)
    # ---
    
    def __iter__(self):
        return self
    # ---
    
    def __next__(self):
        if self.buffer:
            return self.buffer.pop()
        return next(self.file)
    # ---
# --- BufferedReader


[docs]class SSM_Reader(vcf.Reader):
    """Reader class for the International Cancer Genome 
    Consortium aggregate file of simple somatic mutations
    from the Data Releases.
    
    Example::
        
            >>> reader = SSM_Reader(filename='data/ssm_sample.vcf')

            >>> for record in reader.parse(filters=['BRCA-EU']):
            ...    print(record.ID, record.CHROM, record.POS)
            MU66865518 1 100141201
            MU65487875 1 100160548
            MU66281118 1 100638179
            MU66254120 1 101352655
                ...
    """
    
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        
        # Add buffering 
        self.reader = BufferedReader(self.reader)
        self.re_filters = []
    # --- 
    
[docs]    def push_line(self, line):
        """Rebuffers line so that it is parsed next."""
        self.reader.push(line)
    # ---
    
[docs]    def next_line(self):
        """Fetch the next raw line from the file."""
        return next(self.reader)
    # ---
    
[docs]    def next_array(self, strict_whitespace=False):
        """Fetch the next line splitted into fields.
        
        If ``strict_whitespace`` is True, then split on tabs rather than 
        whitespace. This allows for fields with spaces in them.
        """
        return next(self.reader).split('\t')
    # ---
    
[docs]    def subfield_parser(self, sf_name, sep='|'):
        """Get a parser for the items of the subfield.
        
        Useful to parse the CONSEQUENCE and OCCURRENCE subfields
        of the INFO field.
        
        Example::
        
            >>> reader = SSM_Reader(filename='data/ssm_sample.vcf')
            
            >>> CONSEQUENCE = reader.subfield_parser('CONSEQUENCE')

            >>> for record in reader.parse(filters=['BRCA-EU']):
            ...    # Which genes are affected?
            ...    print(CONSEQUENCE(record)[0].gene_symbol)
            SLC27A3
            GATAD2B
            TPM3
            SHE
            ADAM15
              ...    
        """
        # Get the description of the subfield
        sf_info = self.infos[sf_name]
        
        # Get the field id
        field_id = sf_info.id

        # Get the subfields names
        subfields_str = re.findall("\(subfields: (.*?)\)", sf_info.desc)[0]
        subfields = subfields_str.split(sep)

        # Create the structure
        field_struct = namedtuple(field_id, subfields)

        # Create parser
        def parse(record):
            # Parse the field items
            return [field_struct(*item.split(sep)) 
                        for item in record.INFO[field_id]
                        if item]

        parse.field_id = sf_info.id
        parse.subfields = subfields
        return parse
    # ---
    
[docs]    def iter_lines(self, filters=None):
        """Iterate through the file's raw lines, filtering out the ones not 
        matching the regular expressions given.  
        """
        if filters is None:
            filters = []
            
        # Compile filters for faster lookup
        filters = [re.compile(regex) 
                       for regex in filters 
                       if regex is not None]
        
        for line in self.reader:
            if all(filter_.search(line) for filter_ in filters):
                   # The line passes all filters
                   yield line
    # ---
                   
[docs]    def parse(self, filters=None):
        """Iterate through the records of the file, 
        filtering out the lines that do not match the 
        regular expressions given.
        
        Example::
        
            >>> reader = SSM_Reader(filename='data/ssm_sample.vcf')

            >>> for record in reader.parse(filters=['BRCA-EU']):
            ...    print(record.ID)
            MU66865518
            MU65487875
            MU66281118
            MU66254120
                ...
            
        """
        for line in self.iter_lines(filters=filters):
            # The parser reads the record from
            # self.reader, so, we must rebuffer 
            # the line to parse it.
            self.reader.push(line)
            yield next(self)
    # ---
# SSM_Reader
Source code for ICGC_data_parser.ssm_reader

ICGC Data Parser

Navigation

Related Topics