Source code for ICGC_data_parser.ssm_reader

"""
Module with functionality to read the ICGC
simple somatic mutations file.
"""

import vcf
import re
from collections import namedtuple


class BufferedReader:
    """A wrapper over a file descriptor that adds buffering functionality.

    Lines handed to :meth:`push` are returned by iteration before anything
    else is read from the wrapped object. The buffer is a stack: the most
    recently pushed line is the first one returned.

    Any attribute not defined on the wrapper itself is looked up on the
    wrapped object.
    """
    def __init__(self, fdesc):
        # The wrapped object; it only needs to support ``next()``.
        self.file = fdesc
        # Pushed-back lines, consumed from the end of the list.
        self.buffer = []
    # ---

    def __getattr__(self, attr):
        """Delegate unknown attributes to the wrapped object.

        Raises:
            AttributeError: if the wrapped object lacks ``attr``, or if
                the wrapper itself has no ``file`` attribute yet.
        """
        # ``__getattr__`` is only called when normal lookup fails. If the
        # missing attribute is ``file`` itself (instance created without
        # running ``__init__``, e.g. while being copied or unpickled),
        # reading ``self.file`` below would re-enter this method forever.
        if attr == 'file':
            raise AttributeError(
                "{!r} object has no attribute {!r}".format(
                    type(self).__name__, attr))
        return getattr(self.file, attr)
    # ---

    def push(self, line):
        """Rebuffer ``line`` so that it is the next item returned."""
        self.buffer.append(line)
    # ---

    def __iter__(self):
        return self
    # ---

    def __next__(self):
        """Return the most recently pushed line, else the next file line."""
        if self.buffer:
            return self.buffer.pop()
        return next(self.file)
    # ---
# --- BufferedReader


class SSM_Reader(vcf.Reader):
    """Reader class for the International Cancer Genome Consortium
    aggregate file of simple somatic mutations from the Data Releases.

    Example::

        >>> reader = SSM_Reader(filename='data/ssm_sample.vcf')
        >>> for record in reader.parse(filters=['BRCA-EU']):
        ...     print(record.ID, record.CHROM, record.POS)
        MU66865518 1 100141201
        MU65487875 1 100160548
        MU66281118 1 100638179
        MU66254120 1 101352655
        ...
    """
    def __init__(self, *args, **kwargs):
        """Build the reader; all arguments go to the parent initializer."""
        super().__init__(*args, **kwargs)
        # Add buffering: wrap the line source set up by the parent so
        # that raw lines can be pushed back and parsed afterwards.
        self.reader = BufferedReader(self.reader)
        self.re_filters = []
    # ---

    def push_line(self, line):
        """Rebuffers line so that it is parsed next."""
        self.reader.push(line)
    # ---

    def next_line(self):
        """Fetch the next raw line from the file."""
        return next(self.reader)
    # ---

    def next_array(self, strict_whitespace=False):
        """Fetch the next line split into tab-separated fields.

        The line is always split on tab characters, so fields may contain
        spaces. The line is not stripped first: whatever line terminator
        the raw line carries stays attached to the last field.

        NOTE(review): ``strict_whitespace`` is accepted but never
        consulted. It was previously documented as switching between
        tab and whitespace splitting; confirm whether that is wanted
        before changing the behaviour callers currently get.
        """
        return next(self.reader).split('\t')
    # ---

    def subfield_parser(self, sf_name, sep='|'):
        """Get a parser for the items of the subfield.

        Useful to parse the CONSEQUENCE and OCCURRENCE subfields
        of the INFO field.

        The subfield names are taken from the ``(subfields: ...)``
        fragment of the INFO description registered under ``sf_name``,
        split on ``sep``. The returned callable takes a record and
        returns one namedtuple per non-empty item of that INFO entry;
        it also exposes ``field_id`` and ``subfields`` attributes.

        Raises:
            IndexError: if the description has no ``(subfields: ...)``
                fragment.

        Example::

            >>> reader = SSM_Reader(filename='data/ssm_sample.vcf')
            >>> CONSEQUENCE = reader.subfield_parser('CONSEQUENCE')
            >>> for record in reader.parse(filters=['BRCA-EU']):
            ...     # Which genes are affected?
            ...     print(CONSEQUENCE(record)[0].gene_symbol)
            SLC27A3
            GATAD2B
            TPM3
            SHE
            ADAM15
            ...
        """
        # Get the description of the subfield
        sf_info = self.infos[sf_name]
        # Get the field id
        field_id = sf_info.id

        # Get the subfields names. Raw string: ``\(`` is a regex escape,
        # not a string escape.
        matches = re.findall(r"\(subfields: (.*?)\)", sf_info.desc)
        subfields = matches[0].split(sep)

        # Create the structure
        field_struct = namedtuple(field_id, subfields)

        # Create parser
        def parse(record):
            # Parse the field items, skipping empty ones
            return [field_struct(*item.split(sep))
                    for item in record.INFO[field_id]
                    if item]

        parse.field_id = field_id
        parse.subfields = subfields

        return parse
    # ---

    def iter_lines(self, filters=None):
        """Iterate through the file's raw lines, filtering out the
        ones not matching the regular expressions given.

        ``filters`` is an iterable of regular expressions; ``None``
        entries are skipped. A line is yielded only when every pattern
        is found somewhere in it. With no filters, every line is yielded.
        """
        if filters is None:
            filters = []

        # Compile filters for faster lookup
        filters = [re.compile(regex)
                   for regex in filters
                   if regex is not None]

        for line in self.reader:
            if all(filter_.search(line) for filter_ in filters):
                # The line passes all filters
                yield line
    # ---

    def parse(self, filters=None):
        """Iterate through the records of the file, filtering out the
        lines that do not match the regular expressions given.

        Example::

            >>> reader = SSM_Reader(filename='data/ssm_sample.vcf')
            >>> for record in reader.parse(filters=['BRCA-EU']):
            ...     print(record.ID)
            MU66865518
            MU65487875
            MU66281118
            MU66254120
            ...
        """
        for line in self.iter_lines(filters=filters):
            # The parser reads the record from
            # self.reader, so, we must rebuffer
            # the line to parse it.
            self.reader.push(line)
            yield next(self)
    # ---
# --- SSM_Reader