Module mixture
[hide private]
[frames] | [no frames]

Source Code for Module mixture

   1  ################################################################################ 
   2  #  
   3  #       This file is part of the Python Mixture Package 
   4  # 
   5  #       file:    mixture.py  
   6  #       author: Benjamin Georgi  
   7  #   
   8  #       Copyright (C) 2004-2009 Benjamin Georgi 
   9  #       Copyright (C) 2004-2009 Max-Planck-Institut fuer Molekulare Genetik, 
  10  #                               Berlin 
  11  # 
  12  #       Contact: georgi@molgen.mpg.de 
  13  # 
  14  #       This library is free software; you can redistribute it and/or 
  15  #       modify it under the terms of the GNU Library General Public 
  16  #       License as published by the Free Software Foundation; either 
  17  #       version 2 of the License, or (at your option) any later version. 
  18  # 
  19  #       This library is distributed in the hope that it will be useful, 
  20  #       but WITHOUT ANY WARRANTY; without even the implied warranty of 
  21  #       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU 
  22  #       Library General Public License for more details. 
  23  # 
  24  #       You should have received a copy of the GNU Library General Public 
  25  #       License along with this library; if not, write to the Free 
  26  #       Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 
  27  # 
  28  # 
  29  # 
  30  ################################################################################ 
  31   
  32  """    
  33    
  34  PyMix - Python Mixture Package 
  35   
  36  The PyMix library implements algorithms and data structures for data mining  
  37  with finite mixture models. The framework is object oriented and  
  38  organized in a hierarchical fashion. 
  39    
  40   
  41   
  42  """ 
  43  import random 
  44  import math 
  45  import copy 
  46  from alphabet import Alphabet,IntegerRange 
  47  from string import split,replace,join 
  48  #from time import time  # for RNG intialization 
  49  import numpy 
  50  from numpy import linalg as la 
  51  import setPartitions 
  52  import sys 
  53  import re 
  54  import cStringIO, tokenize 
  55   
  56  import _C_mixextend  # import C extension 
  57   
  58  #import time # for debuging 
  59   
  60   
  61  # ToDo: use logging package for verbosity control, makes all 'silent' parameters superfluous, 
  62  # needs to trickle down into the whole library  
  63  import logging 
  64  log = logging.getLogger("PyMix") 
  65   
  66  # creating StreamHandler to stderr 
  67  hdlr = logging.StreamHandler(sys.stderr) 
  68   
  69  # setting message format 
  70  #fmt = logging.Formatter("%(name)s %(asctime)s %(filename)s:%(lineno)d %(levelname)s %(thread)-5s - %(message)s") 
  71  fmt = logging.Formatter("%(name)s %(filename)s:%(lineno)d - %(message)s") 
  72  hdlr.setFormatter(fmt) 
  73   
  74  # adding handler to logger object 
  75  log.addHandler(hdlr) 
  76   
  77  # Set the minimal severity of a message to be shown. The levels in 
  78  # increasing severity are: DEBUG, INFO, WARNING, ERROR, CRITICAL 
  79  log.setLevel(logging.ERROR) 
  80   
  81   
  82   
  83  # By default numpy produces a warning whenever we call  numpy.log with an array 
  84  # containing zero values. Usually this will happen a lot, so we change  numpys error handling 
  85  # to ignore this error. Since  numpy.log returns -inf for zero arguments the computations run 
  86  # through just fine. 
  87  numpy.seterr(divide="ignore",invalid="raise") 
  88   
  89   
  90  # XXX TEST for decision bounds 
  91  import numpy.numarray.mlab 
  92   
  93   
class DataSet:
    """
    Class DataSet is the central data object.

    It holds the raw data (dataMatrix with sampleIDs as row labels and
    headers as column labels) and, after internalInit() has been called
    with a model, a model-specific internal representation by sufficient
    statistics (internalData and related attributes).
    """
    def __init__(self):
        """
        Creates and returns an empty DataSet object
        """
        self.N = None       # number of samples
        self.p = None       # number of dimensions
        self.seq_p = None   # number of GHMM sequence features
        self.complex = None # flag for 'complex' data sets (set to 0 by the from* initializers)

        self.sampleIDs = []  # unique sample ids, row label
        self.headers = []    # label for each column
        self.dataMatrix = [] # raw data, one list per sample row

        # attributes for internal data representation by sufficient statistics
        # these attributes are context specific (e.g. they depend on the MixtureModel)
        # and are initialised by the internalInit method.
        self.internalData = None

        self._internalData_views = None  # list of feature-wise views on internalData

        self.suff_dataRange = None  # per-feature end columns within internalData
        self.suff_p = None          # total column width of the sufficient statistics
        self.suff_p_list = []       # per-feature column widths of the sufficient statistics

        # for each feature we can define a symbol (or value in case of continuous features)
        # to represent missing values in the data. This is used for instance in modelInitialization
        # to prevent the placeholders for missing data to influence the initial parameters.
        self.missingSymbols = {}
    def __len__(self):
        """
        Returns the number of samples in the DataSet.

        @return: Number of samples in the DataSet.
        """
        return self.N
138 - def __copy__(self):
139 """ 140 Interface to copy.copy function. 141 142 @return: deep copy of 'self' 143 """ 144 145 cop = DataSet() 146 147 cop.N = self.N 148 cop.p = self.p 149 cop.sampleIDs = copy.copy(self.sampleIDs) 150 cop.headers = copy.copy(self.headers) 151 cop.dataMatrix = copy.deepcopy(self.dataMatrix) 152 cop.internalData = copy.deepcopy(self.internalData) 153 154 cop._internalData_views = copy.deepcopy(self._internalData_views) # XXX 155 156 cop.suff_dataRange = copy.copy(self.suff_dataRange) 157 cop.suff_p = self.suff_p 158 cop.suff_p_list = copy.deepcopy(self.suff_p_list) 159 return cop
160 161
162 - def fromArray(self,array, IDs = None, col_header = None):
163 """ 164 Initializes the data set from a 'numpy' object. 165 166 @param array: 'numpy' object containing the data 167 @param IDs: sample IDs (optional) 168 @param col_header: feature headers (optional) 169 """ 170 171 self.complex = 0 # DataSet is not complex 172 self.seq_p = 0 173 174 self.N = len(array) 175 try: 176 self.p = len(array[0]) 177 except TypeError: # if len() raises an exception array[0] is not a list -> p = 1 178 self.p = 1 179 180 if not IDs: 181 self.sampleIDs = range(self.N) 182 else: 183 self.sampleIDs = IDs 184 185 if not col_header: 186 self.headers = range(self.p) 187 else: 188 self.headers = col_header 189 190 self.dataMatrix = array.tolist()
191 192
193 - def fromList(self,List, IDs = None, col_header = None):
194 """ 195 Initializes the data set from a Python list. 196 197 @param List: Python list containing the data 198 @param IDs: sample IDs (optional) 199 @param col_header: feature headers (optional) 200 """ 201 self.complex = 0 # DataSet is not complex 202 self.seq_p = 0 203 204 self.N = len(List) 205 try: 206 self.p = len(List[0]) 207 except TypeError: # if len() raises an exception array[0] is not a list -> p = 1 208 self.p = 1 209 210 if IDs: 211 self.sampleIDs = IDs 212 else: 213 self.sampleIDs = range(self.N) 214 if col_header: 215 self.headers = col_header 216 else: 217 self.headers = range(self.p) 218 219 self.dataMatrix = List
220 221
222 - def fromFiles(self, fileNames, sep = "\t", missing="*",fileID = None, IDheader=False, IDindex=None):
223 """ 224 Initializes the data set from a list of data flat files. 225 226 @param fileNames: list of data flat files 227 @param sep: separator string between values in flat files, tab is default 228 @param missing: symbol for missing data '*' is default 229 @param fileID: optional prefix for all features in the file 230 @param IDheader: flag whether the sample ID column has a header in the first line of the flat files 231 @param IDindex: index where the sample ids can be found, 0 by default 232 """ 233 234 if IDindex == None: 235 IDindex = [0] * len(fileNames) 236 237 self.complex = 0 # DataSet is not complex 238 self.seq_p = 0 239 240 splitter = re.compile(sep) 241 242 # store data in dictionary with sampleIDs as keys 243 data_dict = {} 244 data_nrs = [0] * len(fileNames) 245 for q,fname in enumerate(fileNames): 246 f = open(fname,"r") 247 248 # parse header line 249 l1 = f.next().rstrip() 250 251 l1 = replace(l1,'"','') # remove " characters, if present 252 253 # split at separation characters 254 #list1= split(l1,sep) 255 list1 = splitter.split(l1) 256 257 # prepending file identifier to column labels 258 if fileID: 259 print "File ident: ", fileID 260 for i in range(len(list1)): 261 list1[i] = str(fileID)+"-"+str(list1[i]) 262 263 print fname,":",len(list1),"features" 264 265 #print list1 266 267 if IDheader == True: # remove header for sample ID column, if present 268 tt = list1.pop(IDindex[q]) 269 print tt 270 #print list1 271 272 data_nrs[q] = len(list1) 273 self.headers = self.headers + list1 274 275 276 277 for h,line in enumerate(f): 278 line = chomp(line) 279 line = replace(line,'"','') # remove " characters, if present 280 281 # cast data values into numerical types if possible 282 sl = splitter.split(line) 283 284 l = numerize(sl) 285 sid = l.pop(IDindex[q]) 286 287 #print '->',sid 288 289 if len(l) != data_nrs[q]: 290 print l 291 print list1 292 raise RuntimeError, "Different numbers of headers and data columns in files " + str(fname)+", sample 
"+str(sid) +" ,"+str(len(l))+" != "+str(data_nrs[q]) 293 294 if not data_dict.has_key(sid): 295 data_dict[sid] = {} 296 data_dict[sid][fname] = l 297 298 else: 299 data_dict[sid][fname] = l 300 301 # assembling data set from data dictionary 302 for k in data_dict: 303 self.sampleIDs.append(k) 304 citem = [] 305 for q,fname in enumerate(fileNames): 306 if data_dict[k].has_key(fname): 307 citem += data_dict[k][fname] 308 else: 309 incomplete = 1 310 print "Warning: No data for sample "+str(k)+" in file "+str(fname)+"." 311 citem += [missing] * data_nrs[q] 312 313 self.dataMatrix.append(citem) 314 315 self.N = len(self.dataMatrix) 316 # checking data-label consistency 317 for i in range(self.N): 318 assert len(self.dataMatrix[i]) == len(self.headers), "Different numbers of headers and data columns in files " + str(fileNames)+", sample "+str(self.sampleIDs[i]) +" ,"+str(len(self.dataMatrix[i])) +" != "+ str( len(self.headers)) 319 320 self.p = len(self.dataMatrix[0])
321 322
323 - def __str__(self):
324 """ 325 String representation of the DataSet 326 327 @return: string representation 328 """ 329 strout = "Data set overview:\n" 330 strout += "N = "+ str(self.N) + "\n" 331 strout += "p = "+str(self.p) + "\n\n" 332 strout += "sampleIDs = " + str(self.sampleIDs)+ "\n\n" 333 strout += "headers = "+ str(self.headers)+ "\n\n" 334 #strout += "dataMatrix = "+str(self.dataMatrix) + "\n" 335 336 return strout
337
338 - def printClustering(self,c,col_width=None):
339 """ 340 Pretty print of a clustering . 341 342 @param c: numpy array of integer cluster labels for each sample 343 @param col_width: column width in spaces (optional) 344 345 """ 346 if self.complex: 347 raise NotImplementedError, "Needs to be done..." 348 349 # get number of clusters in 'c' 350 d = {} 351 for lab in c: 352 if lab == -1: # unassigned samples are handled seperately below 353 continue 354 d[lab]= "" 355 G = len(d.keys()) 356 357 max_h = 0 358 for h in self.headers: 359 if len(str(h)) > max_h: 360 max_h = len(str(h)) 361 max_sid = 0 362 for s in self.sampleIDs: 363 if len(str(s)) > max_sid: 364 max_sid = len(str(s)) 365 if not col_width: 366 space = max_h+2 367 else: 368 space = col_width 369 370 for i in d: 371 t = numpy.where(c == i) 372 index = t[0] 373 print "\n----------------------------------- cluster ",i,"------------------------------------" 374 print ' ' * (max_sid+3), 375 for k in range(len(self.headers)): 376 hlen = len(str(self.headers[k])) 377 print str(self.headers[k])+ " " * (space-hlen), 378 print 379 for j in range(len(index)): 380 print '%-*s' % ( max_sid+3, self.sampleIDs[index[j]]), 381 for k in range(len(self.dataMatrix[index[j]])): 382 dlen = len(str(self.dataMatrix[index[j]][k])) 383 print str(self.dataMatrix[index[j]][k] ) + " " * (space-dlen), 384 print 385 386 t = numpy.where(c == -1) 387 index = t[0] 388 if len(index) > 0: 389 print "\n----------- Unassigned ----------------" 390 space = max_h+2 391 print ' ' * (max_sid+3), 392 for k in range(len(self.headers)): 393 hlen = len(str(self.headers[k])) 394 print self.headers[k]+ " " * (space-hlen), 395 print 396 for j in range(len(index)): 397 print '%-*s' % ( max_sid+3, self.sampleIDs[index[j]]), 398 for k in range(len(self.dataMatrix[index[j]])): 399 dlen = len(str(self.dataMatrix[index[j]][k])) 400 print str(self.dataMatrix[index[j]][k] ) + " " * (space-dlen), 401 print
402
    def internalInit(self, m):
        """
        Initializes the internal representation of the data
        used by the EM algorithm.

        @param m: MixtureModel object
        """
        assert m.p == self.p,"Invalid dimensions in data and model."+str(m.p)+' '+str(self.p)

        # convert each raw sample into its sufficient-statistics representation
        templist = []
        for i in range(len(self.dataMatrix)):
            try:
                [t,dat] = m.components[0].formatData(self.dataMatrix[i])
            except InvalidDistributionInput, ex:
                # annotate the exception with the offending sample, then re-raise
                ex.message += ' ( Sample '+str(self.sampleIDs[i])+', index = '+str(i)+' )'
                raise

            templist.append(dat)

        self.internalData = numpy.array(templist,dtype='Float64')

        # suff_dataRange holds, per feature, the (exclusive) end column of
        # that feature within internalData
        if m.dist_nr > 1:
            self.suff_dataRange = copy.copy(m.components[0].suff_dataRange)
        else:
            self.suff_dataRange = [m.suff_p]

        self.suff_p = m.components[0].suff_p

        # build feature-wise views (column slices) on internalData
        self._internalData_views = []
        for i in range(m.components[0].dist_nr):
            self.suff_p_list.append(m.components[0][i].suff_p) # XXX suff_p_list should go away, only need in singleFeatureSubset
            if i == 0:
                prev_index = 0
            else:
                prev_index = self.suff_dataRange[i-1]

            this_index = self.suff_dataRange[i]
            if self.p == 1: # only a single feature
                self._internalData_views.append(self.internalData)
            else:
                self._internalData_views.append( self.internalData[:,prev_index:this_index ] )
445 - def getInternalFeature(self, i):
446 """ 447 Returns the columns of self.internalData containing the data of the feature with index 'i' 448 449 @param i: feature index 450 @return: numpy containing the data of feature 'i' 451 """ 452 #assert self.suff_dataRange is not None,'DataSet needs to be initialized with .internalInit()' 453 if i < 0 or i >= len(self.suff_dataRange): 454 raise IndexError, "Invalid index " + str(i) 455 456 return self._internalData_views[i]
457
458 - def removeFeatures(self, ids,silent = 0):
459 """ 460 Remove a list of features from the data set. 461 462 @param ids: list of feature identifiers 463 @param silent: verbosity control 464 """ 465 # removing columns from data matrix 466 for i in ids: 467 try: 468 ind = self.headers.index(i) 469 except ValueError: 470 sys.stderr.write("\nERROR: Feature ID "+str(i)+ " not found.\n") 471 raise 472 473 for k in range(self.N): 474 self.dataMatrix[k].pop(ind) 475 476 r = self.headers.pop(ind) 477 self.p -= 1 478 if not silent: 479 print "Feature "+str(r)+" has been removed."
480
481 - def removeSamples(self, ids,silent = 0):
482 """ 483 Remove a list of samples from the data set. 484 485 @param ids: list of sample identifiers 486 @param silent: verbosity control 487 """ 488 if self.internalData: 489 print "Warning: internalInit has to be rerun after removeSamples." 490 self.internalData = None 491 self.suff_dataRange = None 492 self.suff_p = None 493 self.suff_p_list = None 494 495 for si in ids: 496 sind = self.sampleIDs.index(si) 497 self.dataMatrix.pop(sind) 498 self.sampleIDs.pop(sind) 499 500 if not silent: 501 print 'Samples '+str(ids)+' removed' 502 503 self.N = self.N - len(ids)
504 505
506 - def filterSamples(self,fid,min_value,max_value):
507 """ 508 Removes all samples with values < 'min_value' or > 'max_value' in feature 'fid'. 509 510 @param fid: feature ID in self.headers 511 @param min_value: minimal required value 512 @param max_value: maximal required value 513 514 """ 515 if self.internalData: 516 print "Warning: internalInit has to be rerun after removeSamples." 517 self.internalData = None 518 self.suff_dataRange = None 519 self.suff_p = None 520 self.suff_p_list = None 521 522 ind = self.headers.index(fid) 523 524 print "Removing samples with "+fid +" < "+str(min_value)+" or > "+str(max_value)+" ...", 525 i = 0 # current index in dataMatrix 526 c = 0 # number of samples already considered 527 r = 0 # number of removed samples 528 while c < self.N: 529 if self.dataMatrix[i][ind] < min_value or self.dataMatrix[i][ind] > max_value: 530 # remove sample 531 self.dataMatrix.pop(i) 532 self.sampleIDs.pop(i) 533 c += 1 534 r += 1 535 else: 536 i += 1 537 c += 1 538 539 print str(r)+" samples removed" 540 self.N = self.N - r
541 542
543 - def maskDataSet(self, valueToMask, maskValue, silent=False):
544 """ 545 Allows the masking of a value with another in the entire data matrix. 546 547 @param valueToMask: value to be masked 548 @param maskValue: value which is to be substituted 549 @param silent: verbosity control (False is default) 550 """ 551 count = 0 552 for i in range(self.N): 553 for j in range(self.p): 554 if self.dataMatrix[i][j] == valueToMask: 555 self.dataMatrix[i][j] = maskValue 556 count += 1 557 558 if not silent: 559 print str(count), "values '"+str(valueToMask)+"' masked with '"+str(maskValue)+"' in all features."
560 561
562 - def maskFeatures(self, headerList, valueToMask, maskValue):
563 """ 564 Equivalent to maskDataSet but constrained to a subset of features 565 566 @param headerList: list of features IDs 567 @param valueToMask: value to be masked 568 @param maskValue: value which is to be substituted 569 """ 570 count = 0 571 for h in headerList: 572 try: 573 ind = self.headers.index(h) 574 except ValueError: 575 sys.stderr.write("\nERROR: Feature ID "+str(h)+ " not found.\n") 576 raise 577 578 for j in range(self.N): 579 if str(self.dataMatrix[j][ind]) == str(valueToMask): 580 self.dataMatrix[j][ind] = maskValue 581 count += 1 582 583 print str(count), "values '"+str(valueToMask)+"' masked with '"+str(maskValue)+"' in "+str(len(headerList))+" features."
584 585 586
587 - def getExternalFeature(self, fid):
588 """ 589 Returns the external data representation of a given feature 590 591 @param fid: feature ID in self.headers 592 593 @return: list of data samples for feature fid 594 """ 595 index = self.headers.index(fid) 596 res = [] 597 for i in range(self.N): 598 res.append(self.dataMatrix[i][index]) 599 600 return res
601
    def extractSubset(self, ids):
        """
        Remove all samples in 'ids' from 'self' and return a new DataSet initialised with these samples

        @param ids: list of sample indices

        @return: DataSet object containing the samples in ids
        """
        res = DataSet()

        res.N = len(ids)
        res.p = self.p
        res.suff_dataRange = copy.copy(self.suff_dataRange)
        res.suff_p = self.suff_p
        res.suff_p_list = self.suff_p_list
        res.sampleIDs = ids
        res.headers = copy.copy(self.headers)

        # NOTE(review): .type() looks like numarray-era API; on modern numpy
        # this would be .dtype — confirm against the numpy version targeted
        if self.internalData is not None:
            res.internalData = numpy.zeros((res.N,res.suff_p),self.internalData.type())

        else:
            res.internalData = None

        #remove subset entries from self.internalData
        new_intData = None
        if self.internalData is not None:
            new_intData = numpy.zeros(( (self.N-res.N),res.suff_p),self.internalData.type())
            new_sampleIDs = []
            ni = 0
            # keep only the rows whose sample ID is NOT being extracted
            for i in range(self.N):
                if self.sampleIDs[i] not in ids:
                    new_intData[ni] = self.internalData[i]
                    ni+=1

        for i,d in enumerate(ids):
            ind = self.sampleIDs.index(d)

            # move the raw data row from 'self' into the result
            dat = self.dataMatrix.pop(ind)
            res.dataMatrix.append(dat)

            # fill internalData matrix
            if self.internalData is not None:
                res.internalData[i] = copy.deepcopy(self.internalData[ind])

            self.sampleIDs.pop(ind)

        self.internalData = new_intData
        self.N -= res.N

        return res
655 - def singleFeatureSubset(self, index):
656 """ 657 Returns a DataSet for the feature with internal index 'index' in 'self'. 658 For internal use. 659 660 @param index: feature index 661 662 @return: DataSet object 663 """ 664 res = DataSet() 665 666 res.N = self.N # number of samples 667 res.p = 1 # number of dimensions 668 res.seq_p = self.seq_p # number of GHMM sequence features 669 res.suff_p = self.suff_p_list[index] 670 res.suff_p_list = [ self.suff_p_list[index] ] 671 res.internalData = self.getInternalFeature(index) 672 673 res._internalData_views = [self._internalData_views[index]] # XXX 674 675 if self.headers: 676 res.headers.append(self.headers[index]) 677 if self.sampleIDs: 678 res.sampleIDs = self.sampleIDs 679 680 res.missingSymbols = {} 681 if self.missingSymbols.has_key(index): 682 res.missingSymbols[0] = self.missingSymbols[index] 683 684 res.suff_dataRange = [self.suff_dataRange[index]-self.suff_dataRange[index-1]] 685 return res
686
687 - def setMissingSymbols(self, findices, missing):
688 """ 689 Assigns missing value placeholders to features. 690 691 @param findices: list of internal feature indices 692 @param missing: list of missing symbols/values 693 """ 694 assert len(findices) == len( missing) 695 for i,h in enumerate(findices): 696 self.missingSymbols[h] = missing[i]
697
698 - def getMissingIndices(self, ind):
699 """ 700 Get indices of missing values in one feature 701 702 @param ind: feature index 703 704 @return: list of indices of missing values 705 """ 706 assert self.suff_dataRange is not None 707 708 if not self.missingSymbols.has_key(ind): 709 return [] 710 else: 711 m_ind = [] 712 dat_ind = self.getInternalFeature(ind) 713 for i,v in enumerate(dat_ind): 714 # check sample 'i' for missing symbol 715 if numpy.all(v == self.missingSymbols[ind]): 716 m_ind.append(i) 717 return m_ind
718 719
720 - def writeClusteringFasta(self,fn_pref,m):
721 """ 722 Writes a clustering based on model 'm' into files in FASTA format. 723 Note that this implies sequence data. 724 725 @param fn_pref: Filename prefix. The full name of each output file consists 726 of the prefix, the cluster number and the extension .fa 727 @param m: MixtureModel object 728 """ 729 c = m.classify(self,silent=1) 730 731 # get number of clusters in 'c' 732 d = {} 733 for a in c: 734 d[a]= "" 735 G = len(d) 736 737 # write data to file 738 for k in d: 739 t = numpy.where(c == k) 740 index = t[0] 741 f = open(fn_pref+'_'+str(k)+'.fa','w') 742 for i in index: 743 f.write(">"+str(self.sampleIDs[i])+"\n") 744 s = join(self.dataMatrix[i],'') # convert to sequence 745 f.write(s+"\n") 746 f.close()
747 748 749
def numerize(data):
    """
    Cast all elements in a list to numeric values.

    Entries that parse as int become ints, the rest that parse as float
    become floats, anything else is left untouched. The list is modified
    in place and also returned.

    @param data: list of data

    @return: list of processed data
    """
    for pos, item in enumerate(data):
        try:
            data[pos] = int(item)
        except ValueError:
            try:
                data[pos] = float(item)
            except ValueError:
                pass  # non-numeric entry, keep as is
    return data
def remove_col(matrix, index):
    """
    Removes a column in a Python matrix (list of lists).

    The matrix is modified in place and also returned.

    @param matrix: Python list of lists
    @param index: index of column to be removed

    @return: matrix with column deleted
    """
    for row in matrix:
        del row[index]
    return matrix
class MixtureError(Exception):
    """Base class for mixture exceptions."""

    def __init__(self, message):
        """
        @param message: textual description of the error
        """
        # also initialize Exception so that 'args' is populated and the
        # exception reprs/pickles correctly (previously skipped)
        Exception.__init__(self, message)
        self.message = message

    def __str__(self):
        return str(self.message)
class InvalidPosteriorDistribution(MixtureError):
    """
    Raised if an invalid posterior distribution occurs.
    """
    # __init__ and __str__ are inherited from MixtureError; the previous
    # verbatim re-implementations were redundant.
    pass
class InvalidDistributionInput(MixtureError):
    """
    Raised if a DataSet is found to be incompatible with a given MixtureModel.
    """
    # __init__ and __str__ are inherited from MixtureError; the previous
    # verbatim re-implementations were redundant.
    pass
class ConvergenceFailureEM(MixtureError):
    """
    Raised if the EM algorithm fails to converge.

    (The previous docstring was a copy-paste of InvalidDistributionInput's.)
    """
    # __init__ and __str__ are inherited from MixtureError; the previous
    # verbatim re-implementations were redundant.
    pass
class ProbDistribution:
    """
    Base class for all probability distributions.

    Concrete distributions override the methods below; the base class only
    fixes the interface.
    """

    def __init__(self):
        """
        Constructor
        """
        pass

    def __eq__(self, other):
        """
        Interface for the '==' operation

        @param other: object to be compared
        """
        raise NotImplementedError

    def __str__(self):
        """
        String representation of the distribution

        @return: string representation
        """
        raise NotImplementedError

    def __copy__(self):
        "Interface for the copy.copy function"
        raise NotImplementedError

    def pdf(self, data):
        """
        Density function.
        MUST accept either numpy or DataSet object of appropriate values. We use numpys as input
        for the atomar distributions for efficiency reasons (The cleaner solution would be to construct
        DataSet subset objects for the different features and we might switch over to doing that eventually).

        @param data: DataSet object or numpy array

        @return: log-value of the density function for each sample in 'data'
        """
        raise NotImplementedError

    def MStep(self, posterior, data, mix_pi=None):
        """
        Maximization step of the EM procedure. Reestimates the distribution parameters
        using the posterior distribution and the data.

        MUST accept either numpy or DataSet object of appropriate values. numpys are used as input
        for the atomar distributions for efficiency reasons

        @param posterior: posterior distribution of component membership
        @param data: DataSet object or 'numpy' of samples
        @param mix_pi: mixture weights, necessary for MixtureModels as components.
        """
        raise NotImplementedError

    def sample(self):
        """
        Samples a single value from the distribution.

        @return: sampled value
        """
        raise NotImplementedError("Needs implementation")

    def sampleSet(self, nr):
        """
        Samples several values from the distribution.

        @param nr: number of values to be sampled.

        @return: sampled values
        """
        raise NotImplementedError("Needs implementation")

    def sufficientStatistics(self, posterior, data):
        """
        Returns sufficient statistics for a given data set and posterior.

        @param posterior: numpy vector of component membership posteriors
        @param data: numpy vector holding the data

        @return: list with dot(posterior, data) and dot(posterior, data**2)
        """
        raise NotImplementedError("Needs implementation")

    def isValid(self, x):
        """
        Checks whether 'x' is a valid argument for the distribution and raises InvalidDistributionInput
        exception if that is not the case.

        @param x: single sample in external representation, i.e. an entry of DataSet.dataMatrix

        @return: True/False flag
        """
        raise NotImplementedError

    def formatData(self, x):
        """
        Formats samples 'x' for inclusion into DataSet object. Used by DataSet.internalInit()

        @param x: list of samples

        @return: two element list: first element = dimension of self, second element = sufficient statistics for samples 'x'
        """
        return [self.p, x]

    def flatStr(self, offset):
        """
        Returns the model parameters as a string compatible
        with the WriteMixture/ReadMixture flat file format.

        @param offset: number of '\t' characters to be used in the flatfile.
        """
        raise NotImplementedError

    def posteriorTraceback(self, x):
        """
        Returns the decoupled posterior distribution for each
        sample in 'x'. Used for analysis of clustering results.

        @param x: list of samples

        @return: decoupled posterior
        """
        raise NotImplementedError

    def update_suff_p(self):
        """
        Updates the .suff_p field.
        """
        return self.suff_p

    def merge(self, dlist, weights):
        """
        Merges 'self' with the distributions in 'dlist' by a
        convex combination of the parameters as determined by 'weights'

        @param dlist: list of distribution objects of the same type as 'self'
        @param weights: list of weights, need not to sum up to one
        """
        raise NotImplementedError
class PriorDistribution(ProbDistribution):
    """
    Prior distribution base class for the Bayesian framework
    """

    def pdf(self, m):
        """
        Returns the log-density of the ProbDistribution object(s) 'm' under the
        prior.

        @param m: single appropriate ProbDistribution object or list of ProbDistribution objects
        """
        raise NotImplementedError("Needs implementation")

    def posterior(self, m, x):
        raise NotImplementedError("Needs implementation")

    def marginal(self, x):
        raise NotImplementedError("Needs implementation")

    def mapMStep(self, dist, posterior, data, mix_pi=None, dist_ind=None):
        """
        Maximization step of the maximum aposteriori EM procedure. Reestimates the distribution parameters
        of argument 'dist' using the posterior distribution, the data and a conjugate prior.

        MUST accept either numpy or DataSet object of appropriate values. numpys are used as input
        for the atomar distributions for efficiency reasons.

        @param dist: distribution whose parameters are to be maximized
        @param posterior: posterior distribution of component membership
        @param data: DataSet object or 'numpy' of samples
        @param mix_pi: mixture weights, necessary for MixtureModels as components.
        @param dist_ind: optional index of 'dist', necessary for ConditionalGaussDistribution.mapMStep (XXX)
        """
        raise NotImplementedError

    def mapMStepMerge(self, group_list):
        """
        Computes the MAP parameter estimates for a candidate merge in the structure
        learning based on the information of two CandidateGroup objects.

        @param group_list: list of CandidateGroup objects
        @return: CandidateGroup object with MAP parameters
        """
        raise NotImplementedError

    def mapMStepSplit(self, toSplitFrom, toBeSplit):
        """
        Computes the MAP parameter estimates for a candidate split in the structure
        learning based on the information of two CandidateGroup objects.

        @return: CandidateGroup object with MAP parameters
        """
        raise NotImplementedError

    def updateHyperparameters(self, dists, posterior, data):
        """
        Update the hyperparameters in an empirical Bayes fashion.

        @param dists: list of ProbabilityDistribution objects
        @param posterior: numpy matrix of component membership posteriors
        @param data: DataSet object
        """
        raise NotImplementedError
class NormalDistribution(ProbDistribution):
    """
    Univariate Normal Distribution
    """
    def __init__(self, mu, sigma):
        """
        Constructor

        @param mu: mean parameter
        @param sigma: standard deviation parameter
        """
        self.p = 1       # dimensionality of the data
        self.suff_p = 1  # dimensionality of the sufficient statistics
        self.mu = mu
        self.sigma = sigma

        self.freeParams = 2  # mu and sigma

        self.min_sigma = 0.1  # minimal standard deviation
1067 - def __eq__(self,other):
1068 res = False 1069 if isinstance(other,NormalDistribution): 1070 if numpy.allclose(other.mu, self.mu) and numpy.allclose(other.sigma, self.sigma): 1071 res = True 1072 return res
1073
1074 - def __copy__(self):
1075 return NormalDistribution(copy.deepcopy(self.mu), copy.deepcopy(self.sigma))
1076
1077 - def __str__(self):
1078 return "Normal: ["+str(self.mu)+", "+str(self.sigma) + "]"
1079 1080
1081 - def pdf(self, data):
1082 1083 # Valid input arrays will have the form [[sample1],[sample2],...] or 1084 # [sample1,sample2, ...], the latter being the input format to the extension function, 1085 # so we might have to reformat the data 1086 if isinstance(data, DataSet ): 1087 assert data.internalData is not None, "Internal data not initialized." 1088 nr = len(data.internalData) 1089 assert data.internalData.shape == (nr,1), 'shape = '+str(data.internalData.shape) 1090 1091 x = numpy.transpose(data.internalData)[0] 1092 1093 elif isinstance(data, numpy.ndarray): 1094 nr = len(data) 1095 1096 if data.shape == (nr,1): # data format needs to be changed 1097 x = numpy.transpose(data)[0] 1098 elif data.shape == (nr,): 1099 x = data 1100 else: 1101 raise TypeError,'Invalid data shape: '+str(data.shape) 1102 else: 1103 raise TypeError,"Unknown/Invalid input type:"+str(type(data)) 1104 1105 # computing log likelihood 1106 res = _C_mixextend.wrap_gsl_ran_gaussian_pdf(self.mu, self.sigma, x) 1107 1108 return numpy.log(res)
1109
1110 - def sample(self):
1111 return random.normalvariate(self.mu, self.sigma)
1112 1113
1114 - def sampleSet(self,nr):
1115 res = numpy.zeros(nr,dtype='Float64') 1116 1117 for i in range(nr): 1118 res[i] = self.sample() 1119 1120 return res
1121
1122 - def sufficientStatistics(self, posterior, data):
1123 """ 1124 Returns sufficient statistics for a given data set and posterior. In case of the Normal distribution 1125 this is the dot product of a vector of component membership posteriors with the data and the square 1126 of the data. 1127 1128 @param posterior: numpy vector of component membership posteriors 1129 @param data: numpy vector holding the data 1130 1131 @return: list with dot(posterior, data) and dot(posterior, data**2) 1132 """ 1133 return numpy.array( [numpy.dot(posterior, data)[0], numpy.dot(posterior, data**2)[0]], dtype='Float64')
1134 1135
1136 - def MStep(self,posterior,data,mix_pi=None):
1137 # data has to be reshaped for parameter estimation 1138 if isinstance(data,DataSet): 1139 x = data.internalData[:,0] 1140 elif isinstance(data,numpy.ndarray): 1141 x = data[:,0] 1142 1143 else: 1144 raise TypeError, "Unknown/Invalid input to MStep." 1145 nr = len(x) 1146 1147 sh = x.shape 1148 assert sh == (nr,) # XXX debug 1149 1150 post_sum = numpy.sum(posterior) 1151 1152 # checking for valid posterior: if post_sum is zero, this component is invalid 1153 # for this data set 1154 if post_sum != 0.0: 1155 # computing ML estimates for mu and sigma 1156 new_mu = numpy.dot(posterior, x) / post_sum 1157 new_sigma = math.sqrt(numpy.dot(posterior, (x - new_mu)**2 ) / post_sum) 1158 else: 1159 raise InvalidPosteriorDistribution, "Sum of posterior is zero: "+str(self)+" has zero likelihood for data set." 1160 1161 if new_sigma < self.min_sigma: 1162 # enforcing non zero variance estimate 1163 new_sigma = self.min_sigma 1164 1165 # assigning updated parameter values 1166 self.mu = new_mu 1167 self.sigma = new_sigma 1168
1169 - def isValid(self,x):
1170 try: 1171 float(x) 1172 except (ValueError): 1173 #print "Invalid data: ",x,"in NormalDistribution." 1174 raise InvalidDistributionInput, "\n\tInvalid data: "+str(x)+" in NormalDistribution."
1175
1176 - def formatData(self,x):
1177 if isinstance(x,list) and len(x) == 1: 1178 x = x[0] 1179 self.isValid(x) # make sure x is valid argument 1180 return [self.p,[x]]
1181 1182
1183 - def flatStr(self,offset):
1184 offset +=1 1185 return "\t"*+offset + ";Norm;"+str(self.mu)+";"+str(self.sigma)+"\n"
1186
1187 - def posteriorTraceback(self,x):
1188 return self.pdf(x)
1189
1190 - def merge(self,dlist, weights):
1191 raise DeprecationWarning, 'Part of the outdated structure learning implementation.' 1192 assert len(dlist)+1 == len(weights) 1193 1194 norm = sum(weights) 1195 m_mu = self.mu * weights[0] 1196 #print self.mu," * ", coeff ,"= ",m_mu 1197 1198 for i in range(len(dlist)): 1199 assert isinstance(dlist[i],NormalDistribution) 1200 1201 #print m_mu, " += ", 1202 1203 m_mu += weights[i+1] * dlist[i].mu 1204 #print dlist[i].mu," * ", coeff ,"= ",m_mu 1205 #m_sigma += coeff * dlist[i].sigma 1206 1207 m_mu = m_mu / norm 1208 m_sigma = weights[0] * ( self.sigma + ( self.mu -m_mu)**2 ) 1209 for i in range(len(dlist)): 1210 m_sigma += weights[i+1] * ( dlist[i].sigma + ( dlist[i].mu - m_mu)**2 ) 1211 1212 self.mu = m_mu 1213 self.sigma = m_sigma / norm
1214 1215 1216
class MultiNormalDistribution(ProbDistribution):
    """
    Multivariate Normal Distribution
    """

    def __init__(self, p, mu, sigma):
        """
        Constructor

        @param p: dimensionality of the distribution
        @param mu: mean parameter vector
        @param sigma: covariance matrix
        """
        assert len(mu) == len(sigma) == len(sigma[0]) == p, str(len(mu)) + ' == ' + str(len(sigma)) + ' == ' + str(len(sigma[0])) + ' == ' + str(p)
        self.p = p
        self.suff_p = p
        self.mu = numpy.array(mu, dtype='Float64')
        self.sigma = numpy.array(sigma, dtype='Float64')
        # NOTE(review): counts all p**2 covariance entries as free parameters,
        # although a symmetric covariance has only p*(p+1)/2 distinct ones —
        # kept unchanged since model-selection scores depend on this value.
        self.freeParams = p + p**2

    def __copy__(self):
        return MultiNormalDistribution(self.p, self.mu, self.sigma)

    def __str__(self):
        return "Normal: [" + str(self.mu) + ", " + str(self.sigma.tolist()) + "]"

    def __eq__(self, other):
        if not isinstance(other, MultiNormalDistribution):
            return False
        if self.p != other.p:
            return False
        if not numpy.allclose(self.mu, other.mu) or not numpy.allclose(self.sigma, other.sigma):
            return False
        return True

    def pdf(self, data):
        """
        Log-density of 'data' under the multivariate Normal.

        @param data: DataSet object or numpy array of shape (n, p)
        @return: numpy array of log-densities
        """
        if isinstance(data, DataSet):
            x = data.internalData
        elif isinstance(data, numpy.ndarray):
            x = data
        else:
            raise TypeError("Unknown/Invalid input type.")

        # initial part of the formula; this code depends only on the model
        # parameters ... optimize?
        dd = la.det(self.sigma)
        inverse = la.inv(self.sigma)
        ff = math.pow(2 * math.pi, -self.p / 2.0) * math.pow(dd, -0.5)

        # centered input values
        centered = numpy.subtract(x, numpy.repeat([self.mu], len(x), axis=0))

        res = ff * numpy.exp(-0.5 * numpy.sum(numpy.multiply(centered, numpy.dot(centered, inverse)), 1))

        return numpy.log(res)

    def MStep(self, posterior, data, mix_pi=None):
        """
        Maximum-likelihood reestimation of mu and sigma from posterior-weighted data.

        @param posterior: numpy vector of component membership posteriors
        @param data: DataSet object or numpy array of samples
        @param mix_pi: unused here, kept for interface compatibility
        """
        if isinstance(data, DataSet):
            x = data.internalData
        elif isinstance(data, numpy.ndarray):
            x = data
        else:
            raise TypeError("Unknown/Invalid input to MStep.")

        post = posterior.sum()  # sum of posteriors
        self.mu = numpy.dot(posterior, x) / post

        # centered input values (with new mus)
        centered = numpy.subtract(x, numpy.repeat([self.mu], len(x), axis=0))
        # posterior-weighted covariance estimate
        self.sigma = numpy.dot(numpy.transpose(numpy.dot(numpy.identity(len(posterior)) * posterior, centered)), centered) / post

    def sample(self, A=None):
        """
        Samples from the multivariate Normal distribution.

        @param A: optional Cholesky decomposition of the covariance matrix self.sigma, can speed up
        the sampling
        """
        # FIX: original tested 'A == None', which is an elementwise comparison
        # when A is a numpy array; identity test is the correct check
        if A is None:
            A = la.cholesky(self.sigma)

        z = numpy.zeros(self.p, dtype='Float64')
        for i in range(self.p):
            z[i] = random.normalvariate(0.0, 1.0)  # sample p iid N(0,1) RVs

        X = numpy.dot(A, z) + self.mu
        return X.tolist()  # return value of sample must be Python list

    def sampleSet(self, nr):
        """
        Draws 'nr' samples; the Cholesky factor is computed once and reused.
        """
        A = la.cholesky(self.sigma)
        res = numpy.zeros((nr, self.p), dtype='Float64')
        for i in range(nr):
            res[i, :] = self.sample(A=A)
        return res

    def isValid(self, x):
        """
        Checks whether 'x' is a valid argument for the distribution.

        @raise InvalidDistributionInput: on wrong dimension or non-numeric entries
        """
        if not len(x) == self.p:
            raise InvalidDistributionInput("\n\tInvalid data: wrong dimension(s) " + str(len(x)) + " in MultiNormalDistribution(p=" + str(self.p) + ").")
        for v in x:
            try:
                float(v)
            except (ValueError, TypeError):
                # TypeError covers non-numeric objects such as None
                raise InvalidDistributionInput("\n\tInvalid data: " + str(x) + " in MultiNormalDistribution.")

    def flatStr(self, offset):
        "Serialization into the flat-file format; 'offset' is the indentation level."
        offset += 1
        return "\t" * offset + ";MultiNormal;" + str(self.p) + ";" + str(self.mu.tolist()) + ";" + str(self.sigma.tolist()) + "\n"
1329 1330 1331 1332
class ConditionalGaussDistribution(ProbDistribution):
    """
    Conditional Gauss distributions. Conditional Gaussians
    use a sparse formulation of the covariance matrix, which allows computationally
    efficient modeling of covariance for high-dimensional data.

    Also, the conditional Gaussians implement a tree dependency structure.

    (XXX add reference XXX)
    """

    def __init__(self, p, mu, w, sigma, parents):
        """
        Constructor

        @param p: dimensionality of the distribution
        @param mu: mean parameter vector
        @param w: covariance weights (representing off-diagonal entries in the full covariance matrix)
        @param sigma: standard deviations (diagonal entries of the covariance matrix)
        @param parents: parents in the tree structure implied by w
        """
        assert p == len(mu) == len(w) == len(sigma) == len(parents)

        self.p = p
        self.suff_p = p
        self.freeParams = p * 3
        self.mu = mu            # mean values
        self.w = w              # conditional weights
        self.sigma = sigma      # standard deviations

        self.parents = parents  # tree structure encoded by parent index relationship

    def __str__(self):
        return 'ConditionalGaussian: \nmu=' + str(self.mu) + ', \nsigma=' + str(self.sigma) + ', \nw=' + str(self.w) + ', \nparents=' + str(self.parents)

    def sample(self):
        """
        Draws one sample. Assumes the features are in topological order, so
        each parent value is available before its children are sampled.
        """
        s = [None] * self.p
        s[0] = random.normalvariate(self.mu[0], self.sigma[0])

        for i in range(1, self.p):
            pid = self.parents[i]
            assert s[pid] is not None  # XXX assumes that the features are in topological order
            shift_mu = self.mu[i] - (self.w[i] * self.mu[pid])
            s[i] = random.normalvariate(shift_mu + (self.w[i] * s[pid]), self.sigma[i])

        return s

    def sampleSet(self, nr):
        """
        Draws 'nr' samples.

        @param nr: number of samples
        @return: numpy array of shape (nr, p)
        """
        # FIX: original referenced the undefined name 'n' here, which raised
        # a NameError on every call
        s = numpy.zeros((nr, self.p))
        for i in range(nr):
            s[i, :] = self.sample()

        return s

    def pdf(self, data):
        """
        Log-density of 'data'; each feature is Gaussian conditioned on its
        parent in the tree.

        @param data: numpy array of samples, indexed as data[i, j]
        """
        # XXX assume root as first index
        assert self.parents[0] == -1
        assert self.w[0] == 0.0

        res = numpy.zeros(len(data))

        for i in range(len(data)):
            # root feature: unconditional univariate Gaussian
            res[i] = math.log((1.0 / (math.sqrt(2.0 * math.pi) * self.sigma[0])) * math.exp((data[i, 0] - self.mu[0])**2 / (-2.0 * self.sigma[0]**2)))
            for j in range(1, self.p):
                # non-root features: Gaussian with mean shifted by the
                # weighted, centered parent value
                pind = self.parents[j]
                res[i] += math.log((1.0 / (math.sqrt(2.0 * math.pi) * self.sigma[j])) * math.exp((data[i, j] - self.mu[j] - self.w[j] * (data[i, pind] - self.mu[pind]))**2 / (-2.0 * self.sigma[j]**2)))

        return res

    def MStep(self, posterior, data, mix_pi=None):
        """
        Maximum-likelihood reestimation of mu, w and sigma from
        posterior-weighted data.

        @raise ValueError: if the posterior sums to zero
        """
        var = {}
        post_sum = numpy.sum(posterior)

        # checking for valid posterior: if post_sum is zero, this component
        # is invalid for this data set
        if post_sum != 0.0:
            # reestimate mu and per-feature variance
            for j in range(self.p):
                self.mu[j] = numpy.dot(posterior, data[:, j]) / post_sum
                var[j] = numpy.dot(posterior, (data[:, j] - self.mu[j])**2) / post_sum

            for j in range(self.p):
                pid = self.parents[j]
                if pid != -1:  # has a parent
                    # FIX: covariance with the parent is computed only for
                    # non-root features; the original computed it for the root
                    # as well, silently indexing data[:, -1] (last column) and
                    # discarding the result. A stray debug print was removed.
                    cov_j = numpy.dot(posterior, (data[:, j] - self.mu[j]) * (data[:, pid] - self.mu[pid])) / post_sum
                    self.w[j] = cov_j / var[pid]
                    self.sigma[j] = math.sqrt(var[j] - (self.w[j]**2 * var[pid]))
                else:
                    self.sigma[j] = math.sqrt(var[j])

        else:
            raise ValueError('Invalid posterior.')

    def isValid(self, x):
        """
        Checks whether 'x' is a valid argument for the distribution.

        @raise InvalidDistributionInput: on wrong dimension or non-numeric entries
        """
        if not len(x) == self.p:
            # FIX: error messages previously named MultiNormalDistribution
            # (copy-paste); corrected to this class
            raise InvalidDistributionInput("\n\tInvalid data: wrong dimension(s) " + str(len(x)) + " in ConditionalGaussDistribution(p=" + str(self.p) + ").")
        for v in x:
            try:
                float(v)
            except (ValueError, TypeError):
                raise InvalidDistributionInput("\n\tInvalid data: " + str(x) + " in ConditionalGaussDistribution.")
1442
class DependenceTreeDistribution(ConditionalGaussDistribution):
    """
    This class implements a tree of conditional Gaussians, including the
    tree topology learning.
    """

    def __init__(self, p, mu, w, sigma):
        """
        Constructor

        @param p: dimensionality of the distribution
        @param mu: mean parameter vector
        @param w: covariance weights (representing off-diagonal entries in the full covariance matrix)
        @param sigma: standard deviations (diagonal entries of the covariance matrix)
        """
        # FIX: the original called randomStructure(p), discarded the result
        # and then rebuilt the identical linear structure inline; a single
        # call produces the same initialization
        struct = self.randomStructure(p)
        ConditionalGaussDistribution.__init__(self, p, mu, w, sigma, struct)

    def MStep(self, posterior, data, mix_pi=None):
        """
        Reestimates the parameters AND relearns the tree topology from the
        posterior-weighted correlation matrix.
        """
        if isinstance(data, DataSet):
            x = data.internalData
        elif isinstance(data, numpy.ndarray):
            x = data
        else:
            raise TypeError("Unknown/Invalid input to MStep.")

        post = posterior.sum()  # sum of posteriors
        self.mu = numpy.dot(posterior, x) / post

        # centered input values (with new mus)
        centered = numpy.subtract(x, numpy.repeat([self.mu], len(x), axis=0))

        # estimating correlation factor
        sigma = numpy.dot(numpy.transpose(numpy.dot(numpy.identity(len(posterior)) * posterior, centered)), centered) / post  # sigma/covariance matrix

        diagsigma = numpy.diagflat(1.0 / numpy.diagonal(sigma))  # vector with diagonal entries of sigma matrix
        # correlation matrix with entries sigma_xy^2/(sigma^2_x * sigma^2_y)
        correlation = numpy.dot(numpy.dot(diagsigma, numpy.multiply(sigma, sigma)), diagsigma)

        correlation = correlation - numpy.diagflat(numpy.diagonal(correlation))  # making diagonal entries = 0

        # XXX - check this
        parents = self.maximunSpanningTree(correlation)  # maximum spanning tree of the correlation matrix
        self.parents = self.directTree(parents, 0)  # by default direct tree from 0

        # XXX note that computational time could be saved as these functions
        # share the same sufficient statistics
        ConditionalGaussDistribution.MStep(self, posterior, data, mix_pi)

    def maximunSpanningTree(self, weights):
        """
        Estimates the MST given a fully connected graph defined by the symetric matrix weights.
        using Prim`s algorithm.

        NOTE(review): 'weights' is modified in place (edges into the growing
        tree are zeroed out).

        @param weights: edge weights
        @return: adjacency dict of the undirected spanning tree
        """
        # start with an empty tree and vertex 0
        edgestree = {}
        for i in range(self.p):
            edgestree[i] = []
        verticestree = [0]

        while len(verticestree) < self.p:
            # candidate edges are only those leaving vertices already in the tree
            candidates = weights[verticestree, :]

            # look for the maximal candidate edge
            indices = numpy.argmax(candidates, 1)  # best neighbour per tree vertex
            values = numpy.max(candidates, 1)
            uaux = numpy.argmax(values)
            u = verticestree[uaux]
            v = indices[uaux]

            # add edge (u,v) to the tree
            edgestree[u].append(v)
            edgestree[v].append(u)

            # zeroing all weights between v and the tree vertices
            for i in verticestree:
                weights[v, i] = 0
                weights[i, v] = 0

            # add vertex v to the tree
            verticestree.append(v)

        return edgestree

    def directTree(self, tree, root):
        """
        Directs the undirected tree 'tree' away from 'root'.

        @param tree: adjacency dict as returned by maximunSpanningTree
        @param root: index of the root vertex
        @return: dict mapping each vertex to its parent (-1 for the root)
        """
        parent = {}
        queue = []
        # directing the tree from the root
        parent[root] = -1
        visited = numpy.zeros((self.p, 1))
        for u in tree[root]:
            queue.append((root, u))
        visited[root] = 1
        while len(queue) > 0:
            (u, v) = queue.pop()
            parent[v] = u
            for newv in tree[v]:
                if not visited[newv]:
                    queue.append((v, newv))
            visited[v] = 1
        return parent

    def randomStructure(self, p):
        """
        Returns a linear chain structure: feature i+1 depends on feature i,
        feature 0 is the root (parent -1).
        """
        struct = {}
        for i in range(p - 1):
            struct[i + 1] = i
        struct[0] = -1
        return struct

    def __str__(self):
        return 'Dependence Tree: \nmu=' + str(self.mu) + ' \nsigma=' + str(self.sigma) + '\nw=' + str(self.w) + '\nparents=' + str(self.parents)
1569 1570 1571
class ExponentialDistribution(ProbDistribution):
    """
    Exponential distribution
    """

    def __init__(self, lambd):
        """
        Constructor

        @param lambd: rate parameter lambda, 0.0 < lambd <= 1.0
        """
        self.p = self.suff_p = 1
        self.lambd = lambd
        self.freeParams = 1

    def __copy__(self):
        "Interface for the copy.copy function"
        return ExponentialDistribution(self.lambd)

    def __str__(self):
        return "Exponential: [" + str(self.lambd) + "]"

    def __eq__(self, other):
        "Equality holds iff 'other' is an ExponentialDistribution with the same rate."
        return isinstance(other, ExponentialDistribution) and self.lambd == other.lambd

    def pdf(self, data):
        """
        Log-density of 'data' under the exponential distribution.

        @param data: DataSet object or numpy array shaped (n,1) or (n,)
        @return: numpy array of log-densities
        """
        if isinstance(data, DataSet):
            assert data.internalData is not None, "Internal data not initialized."
            nr = len(data.internalData)
            assert data.internalData.shape == (nr, 1), 'shape = ' + str(data.internalData.shape)
            obs = data.internalData[:, 0]
        elif isinstance(data, numpy.ndarray):
            nr = len(data)
            if data.shape == (nr, 1):  # column vector: flatten to 1-d
                obs = data[:, 0]
            elif data.shape == (nr,):
                obs = data
            else:
                raise TypeError('Invalid data shape: ' + str(data.shape))
        else:
            raise TypeError("Unknown/Invalid input type:" + str(type(data)))

        # log f(x) = log(lambda) - lambda * x
        # XXX pure Python implementation for now
        return math.log(self.lambd) + (-self.lambd * obs)

    def sample(self):
        "Draws a single sample."
        return random.expovariate(self.lambd)

    def MStep(self, posterior, data, mix_pi=None):
        """
        Maximum-likelihood reestimation of the rate from posterior-weighted data.
        """
        # data has to be reshaped for parameter estimation
        if isinstance(data, DataSet):
            obs = data.internalData[:, 0]
        elif isinstance(data, numpy.ndarray):
            obs = data[:, 0]
        else:
            raise TypeError("Unknown/Invalid input to MStep.")

        # rate = weighted sample count / weighted sample sum
        self.lambd = posterior.sum() / numpy.dot(posterior, obs)

    def isValid(self, x):
        "Checks whether 'x' is a valid argument for the distribution."
        try:
            float(x)
        except ValueError:
            raise InvalidDistributionInput("\n\tInvalid data: " + str(x) + " in ExponentialDistribution.")

        if x < 0:
            raise InvalidDistributionInput("\n\tInvalid data: negative float " + str(x) + " in ExponentialDistribution.")

    def formatData(self, x):
        """
        Converts an external sample into internal representation.
        """
        if type(x) == list and len(x) == 1:
            x = x[0]
        self.isValid(x)
        return [self.p, [x]]

    def flatStr(self, offset):
        "Serialization into the flat-file format; 'offset' is the indentation level."
        offset += 1
        return "\t" * offset + ";Exp;" + str(self.lambd) + "\n"

    def posteriorTraceback(self, x):
        return self.pdf(x)
1666
class UniformDistribution(ProbDistribution):
    """
    Uniform distribution over a given intervall.
    """

    def __init__(self, start, end):
        """
        Constructor

        @param start: begin of interval
        @param end: end of interval
        """
        assert start < end

        self.p = self.suff_p = 1
        self.freeParams = 0  # fixed distribution: nothing to estimate

        self.start = start
        self.end = end
        # log density is constant inside the interval; computed only once
        self.density = numpy.log(1.0 / (end - start))

    def __eq__(self, other):
        raise NotImplementedError

    def __str__(self):
        return "Uniform: [" + str(self.start) + "," + str(self.end) + "]"

    def __copy__(self):
        raise NotImplementedError

    def pdf(self, data):
        """
        Log-density: self.density inside [start, end], -inf (density 0) outside.
        """
        if isinstance(data, DataSet):
            vals = data.internalData
        elif isinstance(data, numpy.ndarray):
            vals = data
        else:
            raise TypeError("Unknown/Invalid input type.")

        res = numpy.zeros(len(vals), dtype='Float64')
        for idx in range(len(vals)):
            inside = self.start <= vals[idx][0] <= self.end
            res[idx] = self.density if inside else float('-inf')

        return res

    def MStep(self, posterior, data, mix_pi=None):
        # parameter-free distribution: nothing to reestimate
        pass

    def sample(self):
        "Draws a single sample."
        return random.uniform(self.start, self.end)

    def sampleSet(self, nr):
        "Draws 'nr' samples as a Python list."
        return [self.sample() for _ in range(nr)]

    def isValid(self, x):
        "Checks whether 'x' is a valid argument for the distribution."
        try:
            float(x)
        except ValueError:
            raise InvalidDistributionInput("\n\tInvalid data in " + str(x) + " in UniformDistribution.")

    def formatData(self, x):
        "Converts an external sample into internal representation."
        if isinstance(x, list) and len(x) == 1:
            x = x[0]
        self.isValid(x)  # make sure x is valid argument
        return [self.p, [x]]

    def flatStr(self, offset):
        raise NotImplementedError("Boom !")

    def posteriorTraceback(self, x):
        raise NotImplementedError("Kawoom !")

    def update_suff_p(self):
        return self.suff_p

    def merge(self, dlist, weights):
        raise NotImplementedError("Kawoom !")
1751 1752 1753 # ------------------------------------------------------------------------ 1754
class MultinomialDistribution(ProbDistribution):
    """
    Multinomial Distribution
    """

    def __init__(self, p, M, phi, alphabet=None, parFix=None):
        """
        Constructor

        @param M: number of possible outcomes (0 to M-1)
        @param p: number of values in each sample
        @param phi: discrete distribution over the M outcomes
        @param alphabet: Alphabet object (optional)
        @param parFix: list of flags to determine if any elements of phi should be fixed
        """
        assert len(phi) == M, "Invalid number of parameters for MultinomialDistribution."
        assert abs((1.0 - sum(phi))) < 1e-12, str(phi) + ": " + str(1.0 - sum(phi))  # check parameter validity

        self.p = p       # length of input vectors, corresponds to p in MixtureModel
        self.M = M
        self.suff_p = M  # length of the sufficient statistics, equal to size of alphabet

        # in case there is no alphabet specified IntegerRange is used
        if alphabet:
            assert len(alphabet) == self.M, "Size of alphabet and M does not match: " + str(len(alphabet)) + " != " + str(self.M)
            self.alphabet = alphabet
        else:
            self.alphabet = IntegerRange(0, self.M)

        # FIX: identity test instead of '== None' (elementwise on arrays)
        if parFix is None:
            self.parFix = numpy.array([0] * self.M)
        else:
            assert len(parFix) == self.M, "Invalid length of parFix vector."
            self.parFix = numpy.array(parFix)

        # number of free parameters is M-1 minus the number of fixed entries in phi
        self.freeParams = M - 1 - sum(self.parFix)

        self.phi = numpy.array(phi, dtype='Float64')

        # minimal value for any component of self.phi, enforced in MStep
        self.min_phi = (1.0 / self.M) * 0.001

    def __eq__(self, other):
        res = False
        if isinstance(other, MultinomialDistribution):
            if other.p == self.p and other.M == self.M and numpy.allclose(other.phi, self.phi):
                res = True
        return res

    def __copy__(self):
        "Interface for the copy.copy function"
        return MultinomialDistribution(self.p, self.M, copy.deepcopy(self.phi), self.alphabet, parFix=self.parFix)

    def __str__(self):
        outstr = "Multinom(M = " + str(self.M) + ", N = " + str(self.p) + " ) : " + str(self.phi)
        return outstr

    def pdf(self, data):
        """
        Log-density (up to the multinomial coefficient) of count data.

        Note: The multinomial coefficient is omitted in the implementation.
        Result is proportional to the true log density which is sufficient for
        the EM. gsl computes the true density, including the multinomial
        coefficient normalizing constant, therefore it is less efficient than
        the implementation below.
        """
        if isinstance(data, DataSet):
            x = data.internalData
        elif isinstance(data, numpy.ndarray):
            x = data
        else:
            raise TypeError("Unknown/Invalid input type.")

        # switch to log scale for density computation
        log_phi = numpy.log(self.phi)

        # computing un-normalized density
        res = numpy.zeros(len(x), dtype='Float64')
        for j in range(len(x)):
            for i in range(self.M):
                res[j] += (log_phi[i] * x[j, i])

        return res

    def sample(self):
        "Draws one sample of p symbols by inverse-CDF sampling."
        drawn = []
        for i in range(self.p):
            acc = 0.0
            r = random.random()
            for k in range(self.M):
                acc += self.phi[k]
                if acc >= r:
                    break
            drawn.append(k)

        return map(self.alphabet.external, drawn)

    def sampleSet(self, nr):
        "Draws 'nr' samples."
        res = []
        for i in range(nr):
            res.append(self.sample())
        return res

    def MStep(self, posterior, data, mix_pi=None):
        """
        Maximum-likelihood reestimation of phi from posterior-weighted counts,
        honoring fixed entries (parFix) and the minimal value min_phi.

        @raise InvalidPosteriorDistribution: if all weighted counts are zero
        """
        if isinstance(data, DataSet):
            x = data.internalData
        elif isinstance(data, numpy.ndarray):
            x = data
        else:
            raise TypeError("Unknown/Invalid input to MStep.")

        ind = numpy.where(self.parFix == 0)[0]  # indices of free parameters
        fix_flag = 0
        fix_phi = 1.0  # probability mass left for the free parameters
        dsum = 0.0

        # reestimating parameters
        for i in range(self.M):
            if self.parFix[i] == 1:
                fix_phi -= self.phi[i]
                fix_flag = 1
                continue
            else:
                est = numpy.dot(x[:, i], posterior)
                self.phi[i] = est
                dsum += est

        if dsum == 0.0:
            raise InvalidPosteriorDistribution("Invalid posterior in MStep.")

        # normalizing parameter estimates
        self.phi[ind] = (self.phi[ind] * fix_phi) / dsum

        adjust = 0  # adjusting flag
        for i in range(self.M):
            if self.parFix[i] == 0 and self.phi[i] < self.min_phi:
                adjust = 1
                self.phi[i] = self.min_phi

        # renormalizing the adjusted parameters if necessary
        if adjust:
            dsum = sum(self.phi[ind])
            self.phi[ind] = (self.phi[ind] * fix_phi) / dsum

    def isValid(self, x):
        "Checks whether all symbols in 'x' are admissable for the alphabet."
        if sum(map(self.alphabet.isAdmissable, x)) != self.p:
            raise InvalidDistributionInput("\n\tInvalid data: " + str(x) + " in MultinomialDistribution(" + str(self.alphabet.listOfCharacters) + ").")

    def formatData(self, x):
        """
        Converts an external sample into the internal count-vector representation.

        @return: [self.M, count vector]
        """
        count = [0] * self.M

        # special case of p = 1
        if len(x) == 1:
            self.isValid(str(x[0]))
            count[self.alphabet.internal(str(x[0]))] = 1
            return [self.M, count]

        # FIX: validity does not depend on the loop index; the original
        # called isValid(x) once per alphabet symbol inside the loop
        self.isValid(x)
        for i in range(self.M):
            sym = self.alphabet.listOfCharacters[i]
            count[i] = sum([1 for v in x if v == sym])

        return [self.M, count]

    def flatStr(self, offset):
        "Serialization into the flat-file format; 'offset' is the indentation level."
        offset += 1
        return "\t" * offset + ";Mult;" + str(self.p) + ";" + str(self.M) + ";" + str(self.phi.tolist()) + ";" + str(self.alphabet.listOfCharacters) + ";" + str(self.parFix.tolist()) + "\n"

    def posteriorTraceback(self, x):
        return self.pdf(x)

    def merge(self, dlist, weights):
        # Part of the outdated structure learning implementation; the
        # unreachable legacy body after the raise was removed.
        raise DeprecationWarning('Part of the outdated structure learning implementation.')
1942 1943 1944
class DiscreteDistribution(MultinomialDistribution):
    """
    This is the special case of a MultinomialDistribution with p = 1, that is a simple univariate discrete
    distribution. Certain key functions are overloaded for increased efficiency.
    """
    def __init__(self, M, phi, alphabet = None,parFix = None):
        """
        @param M: size of alphabet
        @param phi: distribution parameters
        @param alphabet: Alphabet object (optional)
        @param parFix: list of flags to determine if any elements of phi should be fixed
        """
        MultinomialDistribution.__init__(self, 1, M, phi, alphabet = alphabet, parFix = parFix)
        # a single symbol is the sufficient statistic here, not a length-M
        # count vector as in the parent class
        self.suff_p = 1

    def __str__(self):
        outstr = "DiscreteDist(M = "+ str(self.M)+"): " + str(self.phi) #+"\n"
        #outstr += str(self.alphabet) + "\n"
        return outstr

    def __copy__(self):
        # Interface for the copy.copy function
        return DiscreteDistribution(self.M,copy.deepcopy(self.phi),self.alphabet,parFix=self.parFix)

    def pdf(self, data):
        """
        Log-density of the symbols in 'data'.

        @param data: DataSet object or numpy array; internal values are used
            directly as indices into phi
        @return: numpy array of log-densities
        """
        if isinstance(data, DataSet ):
            assert data.p == 1
            x = data.getInternalFeature(0)
        elif isinstance(data, numpy.ndarray):
            x = data
        else:
            raise TypeError,"Unknown/Invalid input type."

        # switch to log scale for density computation
        log_phi = numpy.log(self.phi)

        # computing un-normalized density: direct index lookup instead of the
        # count-vector dot product used by the parent class
        res = log_phi[x[:,0].astype('Int32')]
        return res

    def MStep(self,posterior,data,mix_pi=None):
        """
        Maximum-likelihood reestimation of phi from posterior-weighted symbol
        counts, honoring fixed entries (parFix) and the minimal value min_phi.

        @param posterior: numpy vector of component membership posteriors
        @param data: DataSet object or numpy array of samples
        @param mix_pi: unused here, kept for interface compatibility

        @raise InvalidPosteriorDistribution: if all weighted counts are zero
        """
        if isinstance(data,DataSet):
            x = data.internalData
        elif isinstance(data,numpy.ndarray):
            x = data
        else:
            raise TypeError, "Unknown/Invalid input to MStep."

        ind = numpy.where(self.parFix == 0)[0]  # indices of free parameters
        fix_flag = 0
        fix_phi = 1.0   # probability mass reserved for the fixed entries
        dsum = 0.0
        # reestimating parameters
        for i in range(self.M):
            if self.parFix[i] == 1:
                fix_phi -= self.phi[i]
                fix_flag = 1
                continue
            else:
                # posterior mass of all samples equal to symbol i
                i_ind = numpy.where(x == i)[0]
                est = numpy.sum(posterior[i_ind])
                self.phi[i] = est
                dsum += est

        if dsum == 0.0:
            print self
            print posterior

            raise InvalidPosteriorDistribution, "Invalid posterior in MStep."

        # normalizing parameter estimates
        self.phi[ind] = (self.phi[ind] * fix_phi) / dsum

        adjust = 0  # adjusting flag
        for i in range(self.M):
            if self.parFix[i] == 0 and self.phi[i] < self.min_phi:
                #print "---- enforcing minimal phi -----"
                adjust = 1
                self.phi[i] = self.min_phi

        # renormalizing the adjusted parameters if necessary
        if adjust:
            dsum = sum(self.phi[ind])
            self.phi[ind] = (self.phi[ind] * fix_phi) / dsum

    def sample(self):
        """
        Draws a single symbol by inverse-CDF sampling.

        @return: symbol in external (alphabet) representation
        """
        for i in range(self.p):
            sum = 0.0
            p = random.random()
            for k in range(self.M):
                sum += self.phi[k]
                if sum >= p:
                    break
        return self.alphabet.external(k)

    def sampleSet(self,nr):
        """
        Draws 'nr' samples.

        @param nr: number of samples
        @return: list of symbols in external representation
        """
        res = []
        for i in range(nr):
            res.append(self.sample())
        return res

    def sufficientStatistics(self, posterior, data):
        """
        Returns the posterior-weighted symbol counts for 'data'.

        @param posterior: numpy vector of component membership posteriors
        @param data: numpy array of internal symbol values
        @return: numpy vector of length M
        """
        stat = numpy.zeros(self.M,dtype='Float64')
        for i in range(self.M):
            i_ind = numpy.where(data == i)[0]
            stat[i] = numpy.sum(posterior[i_ind])
        return stat

    def formatData(self,x):
        """
        Converts an external sample into internal representation.

        @return: [1, [internal symbol index]]
        """
        self.isValid(x)
        if type(x) == list:
            assert len(x) == 1
            internal = self.alphabet.internal( str(x[0]))
        else:
            internal = self.alphabet.internal( str(x) )
        return [1, [internal] ]

    def isValid(self,x):
        """
        Checks whether 'x' (scalar or singleton list) is admissable for the alphabet.

        @raise InvalidDistributionInput: otherwise
        """
        if type(x) == str or type(x) == int or type(x) == float:
            if not self.alphabet.isAdmissable(str(x)):
                raise InvalidDistributionInput, "\n\tInvalid data: "+str(x)+" in DiscreteDistribution("+str(self.alphabet.listOfCharacters)+")."
        else:
            if type(x) == list and len(x) == 1:
                self.isValid(x[0])
            else:
                raise InvalidDistributionInput, "\n\tInvalid data: "+str(x)+" in DiscreteDistribution("+str(self.alphabet.listOfCharacters)+")."

    def flatStr(self,offset):
        # Serialization into the flat-file format; 'offset' is the indentation level.
        offset +=1
        return "\t"*offset+";Discrete;"+str(self.M)+";"+str(self.phi.tolist())+";"+str(self.alphabet.listOfCharacters)+";"+str(self.parFix.tolist())+"\n"
2076 2077 2078
2079 -class DirichletPrior(PriorDistribution): # DirichletDistribution,
2080 """ 2081 Dirichlet distribution as Bayesian prior for MultinomialDistribution and derived . 2082 """
    def __init__(self, M ,alpha):
        """
        Constructor

        @param M: number of dimensions
        @param alpha: distribution parameters
        """
        assert M == len(alpha)
        #for a in alpha:
        #    assert a > 0.0, "Invalid parameter."

        self.M = M
        self.alpha = numpy.array(alpha, dtype='Float64' )
        self.alpha_sum = numpy.sum(alpha) # assumes alphas to remain constant !
        self.p = M
        self.suff_p = M
        self.freeParams = M

        self.constant_hyperparams = 1  # hyperparameters are constant
2100
2101 - def __copy__(self):
2102 cp_alpha = copy.deepcopy(self.alpha) 2103 return DirichletPrior(self.M,cp_alpha)
2104
2105 - def __str__(self):
2106 return "DirichletPrior: "+str(self.alpha)
2107
2108 - def __eq__(self,other):
2109 if isinstance(other,DirichletPrior): 2110 if self.M == other.M and numpy.alltrue( self.alpha == other.alpha ): 2111 return True 2112 else: 2113 return False 2114 else: 2115 return False
2116
2117 - def sample(self):
2118 """ 2119 Samples from Dirichlet distribution 2120 """ 2121 phi = _C_mixextend.wrap_gsl_dirichlet_sample(self.alpha,self.M) 2122 2123 d = DiscreteDistribution(self.M, phi) 2124 return d
2125 2126
2127 - def pdf(self,m):
2128 2129 # XXX should be unified ... 2130 if isinstance(m, MultinomialDistribution): 2131 # use GSL implementation 2132 #res = pygsl.rng.dirichlet_lnpdf(self.alpha,[phi])[0] XXX 2133 try: 2134 res = _C_mixextend.wrap_gsl_dirichlet_lnpdf(self.alpha, [m.phi]) 2135 except ValueError: 2136 print m 2137 print self 2138 raise 2139 2140 return res[0] 2141 2142 elif isinstance(m, list): 2143 in_l = [ d.phi for d in m ] 2144 # use GSL implementation 2145 res = _C_mixextend.wrap_gsl_dirichlet_lnpdf(self.alpha, in_l) 2146 return res 2147 else: 2148 raise TypeError 2149
2150 - def posterior(self,m,x):
2151 """ 2152 Returns the posterior for multinomial distribution 'm' for multinomial count data 'x' 2153 The posterior is again Dirichlet. 2154 """ 2155 assert isinstance(m, MultinomialDistribution) 2156 res = numpy.ones(len(x),dtype='Float64') 2157 for i,d in enumerate(x): 2158 post_alpha = self.alpha + d 2159 res[i] = numpy.log( _C_mixextend.wrap_gsl_dirichlet_pdf(post_alpha,[m.phi])) 2160 2161 return res 2162
2163 - def marginal(self,x):
2164 """ 2165 Returns the log marginal likelihood of multinomial counts 'x' (sufficient statistics) 2166 with Dirichlet prior 'self' integrated over all parameterizations of the multinomial. 2167 """ 2168 # XXX should be eventually replaced by more efficient implementation 2169 # in Dirchlet mixture prior paper (K. Sjoelander,Karplus,..., D.Haussler) 2170 2171 x_sum = sum(x) 2172 2173 term1 = _C_mixextend.wrap_gsl_sf_lngamma(self.alpha_sum) - _C_mixextend.wrap_gsl_sf_lngamma(self.alpha_sum + x_sum) 2174 term2 = 0.0 2175 for i in range(self.p): 2176 term2 += _C_mixextend.wrap_gsl_sf_lngamma(self.alpha[i] + x[i] ) - _C_mixextend.wrap_gsl_sf_lngamma(self.alpha[i]) 2177 2178 res = term1 + term2 2179 return res
2180
2181 - def mapMStep(self, dist, posterior, data, mix_pi=None, dist_ind = None):
2182 # Since DiscreteDistribution is a special case of MultinomialDistribution 2183 # the DirichletPrior applies to both. Therefore we have to distinguish the 2184 # two cases here. The cleaner alternative would be to derive specialized prior 2185 # distributions but that would be unnecessarily complicated at this point. 2186 if isinstance(dist, DiscreteDistribution): 2187 ind = numpy.where(dist.parFix == 0)[0] 2188 fix_phi = 1.0 2189 dsum = 0.0 2190 for i in range(dist.M): 2191 if dist.parFix[i] == 1: 2192 fix_phi -= dist.phi[i] 2193 continue 2194 else: 2195 i_ind = numpy.where(data == i)[0] 2196 est = numpy.sum(posterior[i_ind]) + self.alpha[i] -1 2197 dist.phi[i] = est 2198 dsum += est 2199 2200 # normalizing parameter estimates 2201 dist.phi[ind] = (dist.phi[ind] * fix_phi) / dsum 2202 elif isinstance(dist, MultinomialDistribution): 2203 2204 fix_phi = 1.0 2205 dsum = 0.0 2206 # reestimating parameters 2207 for i in range(dist.M): 2208 if dist.parFix[i] == 1: 2209 #print "111" 2210 fix_phi -= dist.phi[i] 2211 continue 2212 else: 2213 est = numpy.dot(data[:,i], posterior) + self.alpha[i] -1 2214 dist.phi[i] = est 2215 dsum += est 2216 2217 if dsum == 0.0: 2218 raise InvalidPosteriorDistribution, "Invalid posterior in MStep." 2219 2220 ind = numpy.where(dist.parFix == 0)[0] 2221 # normalzing parameter estimates 2222 dist.phi[ind] = (dist.phi[ind] * fix_phi) / dsum 2223 2224 else: 2225 raise TypeError,'Invalid input '+str(dist.__class__)
2226 2227
2228 - def mapMStepMerge(self, group_list):
2229 #XXX only for DiscreteDistribution for now, MultinomialDistribution to be done 2230 assert isinstance(group_list[0].dist, DiscreteDistribution), 'only for DiscreteDistribution for now' 2231 2232 pool_req_stat = copy.copy(group_list[0].req_stat) 2233 pool_post_sum = group_list[0].post_sum 2234 pool_pi_sum = group_list[0].pi_sum 2235 2236 for i in range(1,len(group_list)): 2237 pool_req_stat += group_list[i].req_stat 2238 pool_post_sum += group_list[i].post_sum 2239 pool_pi_sum += group_list[i].pi_sum 2240 2241 2242 new_dist = copy.copy(group_list[0].dist) # XXX copy necessary ? 2243 2244 ind = numpy.where(group_list[0].dist.parFix == 0)[0] 2245 fix_phi = 1.0 2246 dsum = 0.0 2247 for i in range(group_list[0].dist.M): 2248 if group_list[0].dist.parFix[i] == 1: 2249 assert group_list[1].dist.parFix[i] == 1 # incomplete consistency check of parFix (XXX) 2250 2251 fix_phi -= new_dist.phi[i] 2252 continue 2253 else: 2254 est = pool_req_stat[i] + self.alpha[i] -1 2255 new_dist.phi[i] = est 2256 2257 dsum += est 2258 2259 # normalizing parameter estimates 2260 new_dist.phi[ind] = (new_dist.phi[ind] * fix_phi) / dsum 2261 2262 return CandidateGroup(new_dist, pool_post_sum, pool_pi_sum, pool_req_stat )
2263 2264
2265 - def mapMStepSplit(self, toSplitFrom, toBeSplit):
2266 #XXX only for DiscreteDistribution for now, MultinomialDistribution to be done 2267 assert isinstance(toSplitFrom.dist, DiscreteDistribution), 'only for DiscreteDistribution for now' 2268 2269 split_req_stat = copy.copy(toSplitFrom.req_stat) 2270 split_req_stat -= toBeSplit.req_stat 2271 2272 split_post_sum = toSplitFrom.post_sum - toBeSplit.post_sum 2273 split_pi_sum = toSplitFrom.pi_sum - toBeSplit.pi_sum 2274 2275 2276 2277 new_dist = copy.copy(toSplitFrom.dist) # XXX copy necessary ? 2278 2279 ind = numpy.where(toSplitFrom.dist.parFix == 0)[0] 2280 fix_phi = 1.0 2281 dsum = 0.0 2282 for i in range(toSplitFrom.dist.M): 2283 if toSplitFrom.dist.parFix[i] == 1: 2284 2285 fix_phi -= new_dist.phi[i] 2286 continue 2287 else: 2288 est = split_req_stat[i] + self.alpha[i] -1 2289 new_dist.phi[i] = est 2290 dsum += est 2291 2292 # normalizing parameter estimates 2293 new_dist.phi[ind] = (new_dist.phi[ind] * fix_phi) / dsum 2294 2295 return CandidateGroup(new_dist, split_post_sum, split_pi_sum, split_req_stat )
2296 2297
2298 - def isValid(self,x):
2299 if not isinstance(x,MultinomialDistribution): 2300 raise InvalidDistributionInput, "in DirichletPrior: "+str(x) 2301 else: 2302 if self.M != x.M: 2303 raise InvalidDistributionInput, "in DirichletPrior: "+str(x)
2304
2305 - def flatStr(self,offset):
2306 offset +=1 2307 return "\t"*offset+";DirichletPr;"+str(self.M)+";"+str(self.alpha.tolist())+"\n"
2308 2309 2310
class NormalGammaPrior(PriorDistribution):
    """
    Inverse-Gamma Normal distribution prior for univariate Normal distribution.
    """

    def __init__(self, mu, kappa, dof, scale ):
        """
        Constructor

        @param mu: hyper-parameter mu
        @param kappa: hyper-parameter kappa
        @param dof: hyper-parameter dof
        @param scale: hyper-parameter scale
        """
        # parameters of the Normal prior on mu | sigma
        self.mu_p = float(mu)
        self.kappa = float(kappa)

        # parameters on the inverse-gamma prior on sigma
        self.dof = float(dof)
        self.scale = float(scale)

        self.constant_hyperparams = 1  # hyperparameters are constant

    def __str__(self):
        outstr = "NormalGamma: mu_p="+str(self.mu_p)+", kappa="+str(self.kappa)+", dof="+str(self.dof)+", scale="+str(self.scale)
        return outstr

    def __eq__(self,other):
        # equal iff same type and all four hyper-parameters match
        if not isinstance(other, NormalGammaPrior):
            return False
        if self.mu_p != other.mu_p:
            return False
        if self.kappa != other.kappa:
            return False
        if self.dof != other.dof:
            return False
        if self.scale != other.scale:
            return False
        return True

    def __copy__(self):
        return NormalGammaPrior(self.mu_p, self.kappa, self.dof, self.scale )

    def pdf(self, n):
        """
        Log-density of the prior at the parameters of 'n'.

        @param n: single NormalDistribution or a list of them
        @return: scalar log-density for a single distribution, numpy array
            for a list
        """
        if isinstance(n,NormalDistribution):
            res = _C_mixextend.get_log_normal_inverse_gamma_prior_density( self.mu_p, self.kappa, self.dof, self.scale,[n.mu], [n.sigma] )[0]
            return res

        elif isinstance(n,list):
            # extract parameters, XXX better way to do this ?
            d_sigma = numpy.zeros(len(n))
            d_mu = numpy.zeros(len(n))
            for i,d in enumerate(n):
                d_sigma[i] = d.sigma
                d_mu[i] = d.mu

            # call to extension function
            return _C_mixextend.get_log_normal_inverse_gamma_prior_density( self.mu_p, self.kappa, self.dof, self.scale, d_mu, d_sigma )
        else:
            raise TypeError

    def posterior(self,m,x):
        raise NotImplementedError, "Needs implementation"

    def marginal(self,x):
        raise NotImplementedError, "Needs implementation"

    def mapMStep(self, dist, posterior, data, mix_pi=None, dist_ind = None):
        """
        MAP update of mu and sigma of NormalDistribution 'dist', in place,
        given 'data' weighted by 'posterior'.

        @raise InvalidPosteriorDistribution: if the posterior sums to zero
        """
        assert isinstance(dist,NormalDistribution)
        # data has to be reshaped for parameter estimation
        if isinstance(data,DataSet):
            x = data.internalData[:,0]
        elif isinstance(data,numpy.ndarray):
            x = data[:,0]
        else:
            raise TypeError, "Unknown/Invalid input to MStep."
        nr = len(x)
        sh = x.shape
        assert sh == (nr,)

        post_sum = numpy.sum(posterior)  # n_k
        if post_sum == 0.0:
            print dist
            raise InvalidPosteriorDistribution, "Sum of posterior is zero."

        # computing ML estimates for mu and sigma
        ml_mu = numpy.dot(posterior, x) / post_sum  # ML estimator for mu
        new_mu = ( (post_sum*ml_mu) + (self.kappa * self.mu_p) )/ ( post_sum + self.kappa)

        # NOTE(review): the shrinkage weight divides by (scale + n_k); the
        # corresponding formula in Fraley & Raftery uses (kappa + n_k) in the
        # denominator -- confirm whether this is intentional. (Same pattern in
        # mapMStepMerge/mapMStepSplit.)
        n_sig_num_1 = self.scale + ( (self.kappa * post_sum) / ( self.scale+post_sum ) ) * (ml_mu - self.mu_p)**2
        n_sig_num_2 = numpy.dot(posterior, (x - ml_mu)**2 )
        n_sig_num = n_sig_num_1 + n_sig_num_2
        n_sig_denom = self.dof + post_sum + 3.0

        new_sigma = math.sqrt( n_sig_num / n_sig_denom)

        # assigning updated parameter values
        dist.mu = new_mu
        dist.sigma = new_sigma

    def mapMStepMerge(self, group_list):
        """
        MAP estimate for the candidate obtained by merging the components in
        'group_list'. Sufficient statistics are pooled; req_stat holds the
        posterior-weighted sums (sum w*x, sum w*x**2).

        @param group_list: list of CandidateGroup objects
        @return: CandidateGroup with the merged NormalDistribution and pooled statistics
        """
        pool_req_stat = copy.copy(group_list[0].req_stat)
        pool_post_sum = group_list[0].post_sum
        pool_pi_sum = group_list[0].pi_sum

        for i in range(1,len(group_list)):
            pool_req_stat += group_list[i].req_stat
            pool_post_sum += group_list[i].post_sum
            pool_pi_sum += group_list[i].pi_sum


        new_mu = (pool_req_stat[0] + (self.kappa * self.mu_p) ) / ( pool_post_sum + self.kappa)

        # y is the pooled posterior-weighted mean
        y = (pool_req_stat[0] ) / ( pool_post_sum)
        n_sig_num_1 = self.scale + ( (self.kappa * pool_post_sum) / ( self.scale+pool_post_sum ) ) * (y - self.mu_p)**2
        # weighted sum of squared deviations, expanded around y
        n_sig_num_2 = (pool_req_stat[1]) - 2 * y * (pool_req_stat[0]) + y**2 * pool_post_sum
        n_sig_num = n_sig_num_1 + n_sig_num_2
        n_sig_denom = self.dof + pool_post_sum + 3.0

        new_sigma = math.sqrt( n_sig_num / n_sig_denom)
        new_dist = NormalDistribution(new_mu, new_sigma)

        return CandidateGroup(new_dist, pool_post_sum, pool_pi_sum, pool_req_stat )

    def mapMStepSplit(self, toSplitFrom, toBeSplit):
        """
        MAP estimate for the residual component when 'toBeSplit' is removed
        from 'toSplitFrom' (statistics are subtracted).

        @return: CandidateGroup with the remaining NormalDistribution and statistics
        """
        split_req_stat = copy.copy(toSplitFrom.req_stat)
        split_req_stat -= toBeSplit.req_stat

        split_post_sum = toSplitFrom.post_sum - toBeSplit.post_sum
        split_pi_sum = toSplitFrom.pi_sum - toBeSplit.pi_sum

        new_mu = (split_req_stat[0] + (self.kappa * self.mu_p) ) / ( split_post_sum + self.kappa)

        # y is the residual posterior-weighted mean
        y = (split_req_stat[0] ) / ( split_post_sum)
        n_sig_num_1 = self.scale + ( (self.kappa * split_post_sum) / ( self.scale+split_post_sum ) ) * (y - self.mu_p)**2
        n_sig_num_2 = (split_req_stat[1]) - 2 * y * (split_req_stat[0]) + y**2 * split_post_sum
        n_sig_num = n_sig_num_1 + n_sig_num_2
        n_sig_denom = self.dof + split_post_sum + 3.0
        new_sigma = math.sqrt( n_sig_num / n_sig_denom)

        new_dist = NormalDistribution(new_mu, new_sigma)

        return CandidateGroup(new_dist, split_post_sum, split_pi_sum, split_req_stat )

    def flatStr(self,offset):
        # one-line flat-file representation at the given indentation level
        offset +=1
        return "\t"*offset+";NormalGamma;"+str(self.mu_p)+";"+str(self.kappa)+";"+str(self.dof)+";"+str(self.scale)+"\n"

    def isValid(self,x):
        if not isinstance(x,NormalDistribution):
            raise InvalidDistributionInput, "NormalGammaPrior: "+ str(x)

    def setParams(self, x, K):
        """
        Get guesses for hyper-parameters according to the heuristics used in "Bayesian Regularization for Normal
        Mixture Estimation and Model-Based Clustering" (C.Fraley and A.E. Raftery)

        @param x: numpy data vector
        @param K: number of components
        """
        nr = len(x)
        assert x.shape == (nr,1)
        x = x[:,0]

        self.mu_p = x.mean()
        self.kappa = 0.01
        self.dof = 3.0

        self.scale = x.var() / (K**2)
2487 2488
class DirichletMixturePrior(PriorDistribution):
    """
    Mixture of Dirichlet distributions prior for multinomial data.
    """
    def __init__(self, G, M, pi, dComp):
        """
        @param G: number of components
        @param M: dimensions of component Dirichlets
        @param pi: mixture weights
        @param dComp: list of DirichletPrior distributions
        """
        assert len(dComp) == len(pi) == G
        for d in dComp:
            assert d.M == M

        self.G = G
        self.M = M
        self.pi = numpy.array(pi,dtype='Float64')
        self.log_pi = numpy.log(self.pi)  # assumes pi is not changed from the outside (XXX accesor functions ?)
        self.dComp = dComp
        self.constant_hyperparams = 1  # hyperparameters are constant

    def __str__(self):
        s = ['DirichletMixturePrior( G='+str(self.G)+' )']
        s.append( '\tpi='+str(self.pi)+'\n' )
        for i in range(self.G):
            s.append('\t\t'+str(self.dComp[i])+'\n')
        return ''.join(s)

    def __eq__(self,other):
        # equal iff same type, same dimensions, identical weights and components
        if not isinstance(other,DirichletMixturePrior):
            return False
        if self.G != other.G or self.M != other.M:
            return False
        if not numpy.alltrue(other.pi == self.pi):
            return False
        for i,d1 in enumerate(self.dComp):
            if not d1 == other.dComp[i]:
                return False
        return True

    def __copy__(self):
        cp_pi = copy.deepcopy(self.pi)
        cp_dC = [ copy.deepcopy( self.dComp[i] ) for i in range(self.G) ]
        return DirichletMixturePrior(self.G,self.M,cp_pi,cp_dC)

    def pdf(self, m):
        """
        Log-density of the mixture prior at the parameters of 'm'.

        @param m: MultinomialDistribution or list of them
        @return: scalar log-density for a single distribution, numpy array for a list
        """
        if isinstance(m, MultinomialDistribution):  # XXX debug
            logp_list = numpy.zeros(self.G,dtype='Float64')
            for i in range(self.G):
                logp_list[i] = self.log_pi[i] + self.dComp[i].pdf(m)
            # log-sum-exp over the mixture components
            res = sumlogs(logp_list)
            return res

        elif type(m) == list:
            logp_mat = numpy.zeros((self.G, len(m)))
            for i in range(self.G):
                logp_mat[i,:] = self.dComp[i].pdf(m)

            for i in range(len(m)):  # XXX slow
                logp_mat[:,i] += self.log_pi

            res = _C_mixextend.matrix_sum_logs(logp_mat)
            return res
        else:
            raise TypeError

    def marginal(self,dist,posterior,data):
        """
        Log marginal likelihood of the posterior-weighted counts for 'dist'
        under the Dirichlet mixture prior.

        @param dist: DiscreteDistribution or MultinomialDistribution
        @param posterior: numpy vector of posterior weights
        @param data: internal data (symbol indices resp. count matrix)
        """
        # accumulate posterior-weighted sufficient statistics
        suff_stat = numpy.zeros(self.M,dtype='Float64')
        if isinstance(dist,DiscreteDistribution):
            for i in range(self.M):
                i_ind = numpy.where(data == i)[0]
                suff_stat[i] = numpy.sum(posterior[i_ind])
        elif isinstance(dist, MultinomialDistribution):
            for i in range(self.M):
                suff_stat[i] = numpy.dot(data[:,i], posterior)
        else:
            raise TypeError,'Invalid input '+str(dist.__class__)

        res = 0.0
        for i in range(self.G):
            res += self.dComp[i].marginal(suff_stat) + numpy.log(self.pi[i])
        return res

    def posterior(self,dist):
        """
        Component membership posterior distribution of MultinomialDistribution 'dist'.

        @param dist: MultinomialDistribution object

        @return: numpy of length self.G containing the posterior of component membership
        """
        prior_post = numpy.array([ dirich.pdf(dist) + self.log_pi[i] for i,dirich in enumerate(self.dComp) ], dtype='Float64')
        log_sum = sumlogs(prior_post)
        prior_post -= log_sum

        prior_post = numpy.exp(prior_post)
        return prior_post

    def mapMStep(self, dist, posterior, data, mix_pi=None, dist_ind = None):
        """
        MAP estimation of the parameters of 'dist' under the Dirichlet mixture
        prior; 'dist' is updated in place.
        """
        suff_stat = numpy.zeros(self.M,dtype='Float64')
        if isinstance(dist,DiscreteDistribution):
            for i in range(self.M):
                i_ind = numpy.where(data == i)[0]
                suff_stat[i] = numpy.sum(posterior[i_ind])

        elif isinstance(dist, MultinomialDistribution):
            for i in range(self.M):
                suff_stat[i] = numpy.dot(data[:,i], posterior)
        else:
            raise TypeError,'Invalid input '+str(dist.__class__)

        # posterior of the given multinomial distribution 'dist'
        # with respect to the components of the Dirichlet mixture prior
        prior_post = self.posterior(dist)

        fix_flag = 0
        fix_phi = 1.0
        dsum = 0.0
        for i in range(self.M):
            if dist.parFix[i] == 1:  # checking for fixed entries in phi
                fix_flag = 1
                fix_phi -= dist.phi[i]
            else:  # updating phi[i]
                e = numpy.zeros(self.G,dtype='Float64')
                for k in range(self.G):
                    # posterior mean estimate under component k's Dirichlet
                    e[k] = (suff_stat[i] + self.dComp[k].alpha[i] ) / ( sum(suff_stat) + self.dComp[k].alpha_sum )

                est = numpy.dot(prior_post, e)
                dist.phi[i] = est
                dsum += est

        # re-normalizing parameter estimates if necessary
        if fix_flag:
            ind = numpy.where(dist.parFix == 0)[0]
            dist.phi[ind] = (dist.phi[ind] * fix_phi) / dsum

    def mapMStepMerge(self, group_list):
        """
        MAP estimate for the candidate obtained by merging the components in
        'group_list'.

        @param group_list: list of CandidateGroup objects
        @return: CandidateGroup with the merged distribution and pooled statistics
        """
        new_dist = copy.copy(group_list[0].dist)

        # NOTE(review): the first component's posterior is not weighted by its
        # pi_sum here, although the terms added in the loop below are, and the
        # total is divided by pool_pi_sum. Confirm whether
        # 'self.posterior(group_list[0].dist) * group_list[0].pi_sum' was
        # intended -- the one-sided assert below would not catch a sum > 1.
        prior_post = self.posterior(group_list[0].dist)
        pool_req_stat = copy.copy(group_list[0].req_stat)
        pool_post_sum = group_list[0].post_sum
        pool_pi_sum = group_list[0].pi_sum

        for i in range(1,len(group_list)):
            pool_req_stat += group_list[i].req_stat
            pool_post_sum += group_list[i].post_sum
            pool_pi_sum += group_list[i].pi_sum

            prior_post += self.posterior(group_list[i].dist) * group_list[i].pi_sum

        prior_post = prior_post / pool_pi_sum

        assert 1.0 - sum(prior_post) < 1e-10, str(prior_post)+' , '+str(sum(prior_post))+', '+ str(1.0 - sum(prior_post))  # XXX debug

        fix_flag = 0
        fix_phi = 1.0
        dsum = 0.0
        for i in range(self.M):
            if new_dist.parFix[i] == 1:  # assumes parFix is consistent
                fix_flag = 1
                fix_phi -= new_dist.phi[i]
            else:  # updating phi[i]
                e = numpy.zeros(self.G,dtype='Float64')
                for k in range(self.G):
                    e[k] = (pool_req_stat[i] + self.dComp[k].alpha[i]) / ( pool_post_sum + self.dComp[k].alpha_sum )

                est = numpy.dot(prior_post, e)
                new_dist.phi[i] = est
                dsum += est

                #print i,est

        # re-normalizing parameter estimates if necessary
        if fix_flag:
            ind = numpy.where(new_dist.parFix == 0)[0]
            new_dist.phi[ind] = (new_dist.phi[ind] * fix_phi) / dsum

        return CandidateGroup(new_dist, pool_post_sum, pool_pi_sum, pool_req_stat )

    def isValid(self,x):
        if not isinstance(x,MultinomialDistribution):
            raise InvalidDistributionInput, "DirichletMixturePrior: " + str(x)

        if x.M != self.M:
            raise InvalidDistributionInput, "DirichletMixturePrior: unequal dimensions " + str(x.M) + " != "+str(self.M)

    def flatStr(self,offset):
        # flat-file representation: one header line plus one line per component
        offset +=1
        s = "\t"*offset + ";DirichMixPrior;"+str(self.G)+";"+str(self.M)+";"+str(self.pi.tolist())+"\n"
        for d in self.dComp:
            s+= d.flatStr(offset)
        return s
2689 2690
class ConditionalGaussPrior(PriorDistribution):
    """
    Prior over ConditionalGaussDistribution. Assumes Normal prior over the covariance parameters w.

    """

    def __init__(self, nr_comps, p):
        """
        Constructor

        @param nr_comps: number of components in the mixture the prior is applied to
        @param p: number of features in the ConditionalGaussDistribution the prior is applied to
        """

        self.constant_hyperparams = 0  # hyperparameters are updated as part of the mapEM
        self.nr_comps = nr_comps  # number of components in the mixture the prior is applied to
        self.p = p  # number of features in the ConditionalGaussDistribution the prior is applied to

        # no initial value needed, is updated as part of EM in updateHyperparameters
        self.beta = numpy.zeros((self.nr_comps, self.p))
        self.nu = numpy.zeros((self.nr_comps, self.p))

        # XXX initialization of sufficient statistics, necessary for hyperparameter updates
        self.post_sums = numpy.zeros(self.nr_comps)
        self.var = numpy.zeros( (self.nr_comps,self.p) )
        self.cov = numpy.zeros((self.nr_comps,self.p))
        self.mu = numpy.zeros((self.nr_comps,self.p))

    def __str__(self):
        return 'ConditionalGaussPrior(beta='+str(self.beta)+')'

    def pdf(self, d):
        """
        Log-density of the prior at the parameters of the distributions in 'd'.

        @param d: list of ConditionalGaussDistribution objects (one per component)
        @return: numpy array of log-densities
        """
        if type(d) == list:
            N = numpy.sum(self.post_sums)

            res = numpy.zeros( len(d))
            for i in range(len(d)):
                # feature 0 is the root by convention, so start at 1
                for j in range(1,d[i].p):
                    pid = d[i].parents[j]
                    res[i] += (1.0/self.cov[i,j]**2) / (self.nu[i,j] * (self.post_sums[i]/N))
                    res[i] += numpy.log(_C_mixextend.wrap_gsl_ran_gaussian_pdf(0.0,
                               math.sqrt((self.beta[i,j] * self.cov[i,j]**2) / (self.var[i,pid] * (self.post_sums[i]/N) )),
                               [d[i].w[j]] ))
        else:
            raise TypeError, 'Invalid input '+str(type(d))

        return res

    def posterior(self,m,x):
        raise NotImplementedError, "Needs implementation"

    def marginal(self,x):
        raise NotImplementedError, "Needs implementation"

    def mapMStep(self, dist, posterior, data, mix_pi=None, dist_ind=None):
        """
        MAP update of 'dist' (in place) for component 'dist_ind', refreshing
        the component-specific sufficient statistics (mu, var, cov) and
        hyperparameters (beta, nu) along the way.
        """
        assert not dist_ind == None  # XXX debug

        post_sum = numpy.sum(posterior)
        self.post_sums[dist_ind] = post_sum

        # checking for valid posterior: if post_sum is zero, this component is invalid
        # for this data set
        if post_sum != 0.0:

            # reestimate mu
            for j in range(dist.p):
                # computing ML estimates for w and sigma
                self.mu[dist_ind,j] = numpy.dot(posterior, data[:,j]) / post_sum
                #self.var[dist_ind,j] = numpy.dot(posterior, (data[:,j] - dist.mu[j])**2 ) / post_sum
                self.var[dist_ind,j] = numpy.dot(posterior, (data[:,j] - self.mu[dist_ind,j])**2 ) / post_sum

                if j > 0:  # w[0] = 0.0 is fixed
                    pid = dist.parents[j]
                    self.cov[dist_ind,j] = numpy.dot(posterior, (data[:,j] - self.mu[dist_ind,j]) * (data[:,pid] - self.mu[dist_ind,pid])) / post_sum

                    # update hyperparameters beta
                    self.beta[dist_ind,j] = post_sum / ( (( self.var[dist_ind,j] * self.var[dist_ind,pid]) / self.cov[dist_ind,j]**2) -1 )

                    # update hyperparameters nu
                    self.nu[dist_ind,j] = - post_sum / (2 * dist.sigma[j]**2)

                    # update regression weights
                    dist.w[j] = self.cov[dist_ind,j] / (dist.sigma[pid]**2 * (1+ self.beta[dist_ind,j]**-1 ) )

                    # update standard deviation
                    dist.sigma[j] = math.sqrt( self.var[dist_ind,j] - (dist.w[j]**2 * dist.sigma[pid]**2 * (1+ (1.0/self.beta[dist_ind,j])) ) - self.nu[dist_ind,j]**-1 )
                    # update means
                    dist.mu[j] = self.mu[dist_ind,j]  #- (dist.w[j] * self.mu[dist_ind,pid])

                else:
                    dist.sigma[j] = math.sqrt( self.var[dist_ind,j] )  # root variance
                    dist.mu[j] = self.mu[dist_ind,j]

    def updateHyperparameters(self, dists, posterior, data):
        """
        Updates the hyperparamters in an empirical Bayes fashion as part of the EM parameter estimation.

        (XXX add reference)
        """
        assert len(dists) == posterior.shape[0]  # XXX debug

        # update component-specific hyperparameters
        for i in range(self.nr_comps):
            self.post_sums[i] = numpy.sum(posterior[i,:])
            for j in range(0,self.p):
                #  var_j = numpy.dot(posterior, (data[:,j] - dist.mu[j])**2 ) / post_sum
                self.var[i,j] = numpy.dot(posterior[i,:], (data[:,j] - dists[i].mu[j])**2 ) / self.post_sums[i]

                if j > 0:  # feature 0 is root by convention
                    pid_i_j = dists[i].parents[j]
                    self.cov[i,j] = numpy.dot(posterior[i,:], (data[:,j] - dists[i].mu[j]) * (data[:,pid_i_j] - dists[i].mu[pid_i_j])) / self.post_sums[i]
                    self.beta[i,j] = self.post_sums[i] / ( (( self.var[i,j] * self.var[i,pid_i_j]) / self.cov[i,j]**2) -1 )
                    self.nu[i,j] = - self.post_sums[i] / (2 * dists[i].sigma[j]**2)

    def isValid(self,x):
        if not isinstance(x,ConditionalGaussDistribution):
            raise InvalidDistributionInput, "ConditionalGaussPrior: " + str(x)
2814 2815 2816
2817 -class ProductDistributionPrior(PriorDistribution):
2818 """ 2819 Prior for ProductDistribution objects. Basically only holds a list of priors for 2820 atomar distributions. Necessary for model hierarchy. 2821 """
2822 - def __init__(self,priorList):
2823 """ 2824 Constructor 2825 2826 @param priorList: list of PriorDistribution objects 2827 """ 2828 self.priorList = priorList 2829 self.dist_nr = len(priorList)
2830
2831 - def __getitem__(self,ind):
2832 if ind < 0 or ind > self.dist_nr-1: 2833 raise IndexError, 'Index '+str(ind) 2834 else: 2835 return self.priorList[ind]
2836
2837 - def __setitem__(self,ind, item):
2838 assert isinstance(item,PriorDistribution) 2839 2840 if ind < 0 or ind > self.dist_nr-1: 2841 raise IndexError 2842 else: 2843 self.priorList[ind] = item
2844
2845 - def __eq__(self, other):
2846 if not isinstance(other,ProductDistributionPrior): 2847 return False 2848 if self.dist_nr != other.dist_nr: 2849 return False 2850 for i in range(self.dist_nr): 2851 if not self.priorList[i] == other.priorList[i]: 2852 return False 2853 return True
2854
2855 - def pdf(self,dist):
2856 assert isinstance(dist,ProductDistribution) 2857 res = 0 2858 for i in range(self.dist_nr): 2859 res += self.priorList[i].pdf(dist.distList[i]) 2860 2861 return res
2862
2863 - def marginal(self,dist,posterior,data):
2864 assert isinstance(dist,ProductDistribution) 2865 res = 0 2866 for i in range(self.dist_nr): 2867 res += self.priorList[i].marginal(dist.distList[i],posterior,data.getInternalFeature(i)) 2868 2869 return res
2870
2871 - def mapMStep(self, dist, posterior, data, mix_pi=None, dist_ind = None):
2872 assert dist.suff_dataRange and dist.suff_p, "Attributes for sufficient statistics not initialized." 2873 assert isinstance(data,DataSet) 2874 assert isinstance(dist,ProductDistribution) 2875 assert dist.dist_nr == len(self.priorList) 2876 2877 for i in range(dist.dist_nr): 2878 if isinstance(dist.distList[i],MixtureModel): 2879 # XXX use of isinstance() should be removed, singleFeatureSubset(i) to replace getInternalFeature(i) ? 2880 self.priorList[i].mapMStep(dist.distList[i],posterior,data.singleFeatureSubset(i), mix_pi, dist_ind) 2881 else: 2882 self.priorList[i].mapMStep(dist.distList[i],posterior,data.getInternalFeature(i), mix_pi, dist_ind)
2883 2884
2885 - def isValid(self,p):
2886 if not isinstance(p,ProductDistribution): 2887 raise InvalidDistributionInput,'Not a ProductDistribution.' 2888 if p.dist_nr != self.dist_nr: 2889 raise InvalidDistributionInput,'Different dimensions in ProductDistributionPrior and ProductDistribution: ' + str(p.dist_nr)+' != '+ str(self.dist_nr) 2890 for j in range(p.dist_nr): 2891 try: 2892 self[j].isValid(p[j]) 2893 except InvalidDistributionInput,ex: 2894 ex.message += "\n\tin ProductDistributionPrior.priorList["+str(j)+"]" 2895 raise
2896 2897
2898 -class ProductDistribution(ProbDistribution):
2899 """ Class for joined distributions for a vector of random variables with (possibly) different 2900 types. We assume indepenence between the features. 2901 Implements the naive Bayes Model. 2902 2903 """
2904 - def __init__(self, distList):
2905 """ 2906 Constructor 2907 2908 @param distList: list of ProbDistribution objects 2909 """ 2910 # initialize attributes 2911 self.distList = distList 2912 self.p = 0 2913 self.freeParams = 0 2914 self.dataRange = [] 2915 self.dist_nr = len(distList) 2916 2917 # dimension and dataRange for sufficient statistics data 2918 self.suff_p = 0 2919 self.suff_dataRange = None 2920 for dist in distList: 2921 assert isinstance(dist,ProbDistribution) 2922 self.p += dist.p 2923 self.dataRange.append(self.p) 2924 self.freeParams += dist.freeParams 2925 self.suff_p += dist.suff_p 2926 2927 # initializing dimensions for sufficient statistics data 2928 self.update_suff_p()
2929
2930 - def __eq__(self,other):
2931 if other.p != self.p or other.dist_nr != self.dist_nr: 2932 return False 2933 for i in range(self.dist_nr): 2934 if not (other.distList[i] == self.distList[i]): 2935 return False 2936 return True
2937
2938 - def __copy__(self):
2939 copyList = [] 2940 for i in range(len(self.distList)): 2941 copyList.append(copy.copy(self.distList[i])) 2942 2943 copy_pd = ProductDistribution(copyList) 2944 copy_pd.suff_p = self.suff_p 2945 copy_pd.suff_dataRange = copy.copy(self.suff_dataRange) 2946 return copy_pd
2947
2948 - def __str__(self):
2949 outstr = "ProductDist: \n" 2950 for dist in self.distList: 2951 outstr += " " + str(dist) + "\n" 2952 return outstr
2953
2954 - def __getitem__(self,ind):
2955 if ind < 0 or ind > self.dist_nr-1: 2956 raise IndexError 2957 else: 2958 return self.distList[ind]
2959
2960 - def __setitem__(self, ind, value):
2961 if ind < 0 or ind > self.dist_nr-1: 2962 raise IndexError 2963 else: 2964 self.distList[ind] = value
2965
2966 - def __len__(self):
2967 return self.dist_nr
2968
2969 - def pdf(self,data):
2970 assert self.suff_dataRange and self.suff_p, "Attributes for sufficient statistics not initialized." 2971 if isinstance(data,DataSet): 2972 res = numpy.zeros(data.N, dtype='Float64') 2973 for i in range(self.dist_nr): 2974 if isinstance(self.distList[i], MixtureModel): # XXX only necessary for mixtures of mixtures 2975 res += self.distList[i].pdf(data.singleFeatureSubset(i)) 2976 else: 2977 res += self.distList[i].pdf(data.getInternalFeature(i)) 2978 return res 2979 else: 2980 raise TypeError, 'DataSet object required, got '+ str(type(data))
2981
2982 - def sample(self):
2983 ls = [] 2984 for i in range(len(self.distList)): 2985 s = self.distList[i].sample() 2986 if type(s) != list: 2987 ls.append(s) 2988 else: 2989 ls += s 2990 return ls
2991
2992 - def sampleSet(self,nr):
2993 res = [] 2994 for i in range(nr): 2995 res.append(self.sample() ) 2996 return res
2997
2998 - def sampleDataSet(self, nr):
2999 """ 3000 Returns a DataSet object of size 'nr'. 3001 3002 @param nr: size of DataSet to be sampled 3003 3004 @return: DataSet object 3005 """ 3006 ls = [] 3007 for i in range(nr): 3008 ls.append(self.sample()) 3009 3010 data = DataSet() 3011 data.dataMatrix= ls 3012 data.N = nr 3013 data.p = self.p 3014 data.sampleIDs = [] 3015 3016 for i in range(data.N): 3017 data.sampleIDs.append("sample"+str(i)) 3018 3019 for h in range(data.p): 3020 data.headers.append("X_"+str(h)) 3021 3022 data.internalInit(self) 3023 return data
3024
3025 - def MStep(self,posterior,data,mix_pi=None):
3026 3027 assert self.suff_dataRange and self.suff_p, "Attributes for sufficient statistics not initialized." 3028 assert isinstance(data,DataSet),'DataSet required, got '+str(type(data))+'.' 3029 3030 for i in range(self.dist_nr): 3031 if isinstance(self.distList[i],MixtureModel): 3032 self.distList[i].MStep(posterior,data.singleFeatureSubset(i),mix_pi) 3033 else: 3034 self.distList[i].MStep(posterior,data.getInternalFeature(i))
3035
3036 - def formatData(self,x):
3037 res = [] 3038 last_index = 0 3039 for i in range(len(self.distList)): 3040 3041 # XXX HACK: if distList[i] is an HMM feature there is nothing to be done 3042 # since all the HMM code was moved to mixtureHMM.py we check whether self.distList[i] 3043 # is an HMM by string matching __class__ (for now). 3044 if self.distList[i].p == 1: 3045 strg = str(self.distList[i].__class__) 3046 if strg == 'mixtureHMM.HMM': 3047 continue 3048 3049 if self.dist_nr == 1: 3050 [new_p,dat] = self.distList[i].formatData(x) 3051 res += dat 3052 else: 3053 if self.distList[i].suff_p == 1: 3054 [new_p,dat] = self.distList[i].formatData(x[ self.dataRange[i] -1]) 3055 res += dat 3056 else: 3057 [new_p,dat] = self.distList[i].formatData(x[last_index:self.dataRange[i]]) 3058 res += dat 3059 last_index = self.dataRange[i] 3060 return [self.suff_p,res]
3061
    def isValid(self,x):
        """
        Checks whether the sample 'x' is a valid input for this
        ProductDistribution by delegating to the isValid of every feature
        distribution.

        @param x: list of input values, indexed/sliced per feature via
            self.dataRange

        @raise InvalidDistributionInput: if any feature rejects its part of
            'x'; the offending feature index is appended to the message
        """
        last_index = 0
        for i in range(len(self.distList)):
            if self.distList[i].p == 1:
                # one-dimensional feature: validate the scalar entry
                try:
                    self.distList[i].isValid(x[self.dataRange[i]-1])
                except InvalidDistributionInput,ex:
                    ex.message += "\n\tin ProductDistribution.distList["+str(i)+"]"
                    raise
            else:
                # multi-dimensional feature: validate its slice of 'x'
                try:
                    self.distList[i].isValid(x[last_index:self.dataRange[i]])
                except InvalidDistributionInput,ex:
                    ex.message += "\n\tin ProductDistribution.distList["+str(i)+"]"
                    raise
            # NOTE(review): advanced for every feature so the next slice starts
            # behind this feature's dimensions -- original indentation is
            # ambiguous in this rendering; confirm against the repository source
            last_index = self.dataRange[i]
3078
3079 - def flatStr(self,offset):
3080 offset +=1 3081 s = "\t"*offset + ";Prod;"+str(self.p)+"\n" 3082 for d in self.distList: 3083 s+= d.flatStr(offset) 3084 return s
3085
3086 - def posteriorTraceback(self,x):
3087 res = [] 3088 last_index = 0 3089 assert len(x) == self.suff_p, "Different number of dimensions in data and model." 3090 3091 for i in range(len(self.distList)): 3092 if self.distList[i].suff_p == 1: 3093 res += self.distList[i].posteriorTraceback(x[:,self.suff_dataRange[i]-1]) 3094 else: 3095 res += self.distList[i].posteriorTraceback(x[:,last_index:self.suff_dataRange[i] ]) 3096 3097 last_index = self.suff_dataRange[i] 3098 return res
3099
3100 - def update_suff_p(self):
3101 old_suff_p = None 3102 # in case suff_ variables have already been initialized, 3103 # store old values and compare as consistency check 3104 if self.suff_dataRange is not None and self.suff_p: 3105 old_suff_dataRange = self.suff_dataRange 3106 old_suff_p = self.suff_p 3107 3108 self.suff_dataRange = [] 3109 self.suff_p = 0 3110 for dist in self.distList: 3111 self.suff_p += dist.update_suff_p() 3112 self.suff_dataRange.append(self.suff_p) 3113 3114 if old_suff_p: 3115 assert self.suff_p == old_suff_p,str(self.suff_p)+" != "+ str(old_suff_p) 3116 assert old_suff_dataRange == self.suff_dataRange,str(old_suff_dataRange)+" != "+ str(self.suff_dataRange) 3117 3118 return self.suff_p
3119 3120 #-------------------------------------------------------------------------------------------- 3121 3122
class MixtureModelPrior(PriorDistribution):
    """
    Prior distribution over the parameters of a MixtureModel.

    Combines a DirichletPrior over the mixture weights 'pi', one
    PriorDistribution per feature over the component parameters, and two
    log-scale penalty terms over model complexity: 'nrCompPrior' (number of
    components) and 'structPrior' (number of distinct CSI parameter groups).
    """
    def __init__(self, structPrior, nrCompPrior,piPrior, compPrior):
        """
        Constructor

        @param structPrior: hyperparameter over structure complexity (0< structPrior < 1), stored on log scale internally
        @param nrCompPrior: hyperparameter over number of components (0< nrCompPrior < 1), stored on log scale internally
        @param piPrior: DirichletPrior object
        @param compPrior: list of PriorDistribution objects, one per feature
        """
        assert isinstance(piPrior,DirichletPrior)
        self.piPrior = piPrior

        for p in compPrior:
            assert isinstance(p,PriorDistribution)

        # per-feature priors are wrapped in a ProductDistributionPrior
        self.compPrior = ProductDistributionPrior(compPrior)
        self.dist_nr = len(compPrior)
        # both complexity penalties are stored on log scale
        self.structPrior = numpy.log(structPrior)
        self.nrCompPrior = numpy.log(nrCompPrior)

        # collect the indices of all feature priors which require
        # hyper-parameter updates during parameter estimation
        self.constant_hyperparams = 1
        self.hp_update_indices = []
        for i in range(self.dist_nr):
            if self.compPrior.priorList[i].constant_hyperparams == 0:
                self.hp_update_indices.append(i)

        if len(self.hp_update_indices) > 0:
            self.constant_hyperparams = 0  # there is at least one prior which requires hyper parameter updates

    def __str__(self):
        # human-readable summary of all hyperparameters
        # NOTE(review): iterates self.compPrior (a ProductDistributionPrior)
        # directly -- assumes that class is iterable; verify __iter__ exists
        outstr = "MixtureModelPrior: \n"
        outstr += "dist_nr="+str(self.dist_nr)+"\n"
        outstr += "structPrior ="+str(self.structPrior) +"\n"
        outstr += "nrCompPrior ="+str(self.nrCompPrior) +"\n"
        outstr += " piPrior = "+str(self.piPrior)+"\n"
        for dist in self.compPrior:
            outstr += " " + str(dist) + "\n"

        return outstr

    def __eq__(self,other):
        # equality requires identical hyperparameters, pi prior and feature priors
        if not isinstance(other,MixtureModelPrior):
            return False

        if self.structPrior != other.structPrior or self.nrCompPrior != other.nrCompPrior:
            return False
        if not self.piPrior == other.piPrior:
            return False

        if not self.compPrior == other.compPrior:
            return False

        return True

    def __copy__(self):
        cp_pi = copy.copy(self.piPrior)
        cp_comp = []
        for i in range(self.dist_nr):
            cp_comp.append( copy.copy(self.compPrior[i] ))

        # initialise copy with dummy values for .structPrior, .nrCompPrior
        # (the constructor would take the log of whatever is passed in, so the
        # stored log-scale values are copied over directly afterwards)
        cp_pr = MixtureModelPrior( 0.0, 0.0, cp_pi, cp_comp )
        # set values of hyperparameters
        cp_pr.structPrior = self.structPrior
        cp_pr.nrCompPrior = self.nrCompPrior

        return cp_pr

    def pdf(self, mix):
        """
        Log prior density of the MixtureModel 'mix' under this prior:
        sum of the pi prior, the per-component parameter priors and the
        complexity penalties.

        @param mix: MixtureModel object

        @return: log prior density (scalar)

        @raise ValueError: if the accumulated density is nan
        """
        #assert isinstance(mix,MixtureModel), str(mix.__class__)
        #assert len(self.compPrior) == mix.components[0].dist_nr
        temp = DiscreteDistribution(mix.G,mix.pi)
        res = self.piPrior.pdf(temp)

        # XXX fixed components do not contribute to the prior (HACK)
        # this is needed if we use mixtures of mixtures to model missing data XXX
        if sum(mix.compFix) > 0:
            for j in range(mix.components[0].dist_nr):
                for l in range(mix.G):
                    if mix.compFix[l] != 2:
                        p = self.compPrior[j].pdf( mix.components[l][j] )
                        res += p
        else:
            # operate column wise on mix.components
            for j in range(self.dist_nr):

                if not isinstance(mix.components[0].distList[j], MixtureModel):
                    # vectorized evaluation over the j-th feature of all components
                    d_j = [mix.components[i].distList[j] for i in range(mix.G)]
                    res += numpy.sum(self.compPrior[j].pdf(d_j))
                else:
                    # nested mixtures are evaluated one component at a time
                    for i in range(mix.G):
                        res += self.compPrior[j].pdf(mix.components[i][j])


        # prior over number of components
        res += self.nrCompPrior * mix.G

        # prior over number of distinct groups
        if mix.struct:
            for j in range(mix.components[0].dist_nr):
                res += self.structPrior * len(mix.leaders[j])
        else:
            for j in range(mix.components[0].dist_nr):
                res += self.structPrior * mix.G

        if numpy.isnan(res):
            # uncomment code below for detailed information where the nan value came from (DEBUG)
            # print '--------------------------------'
            # print 'MixtureModelPrior.pdf ',res
            # temp = DiscreteDistribution(mix.G,mix.pi)
            # print '    MixPrior.pdf.pi:',temp,self.piPrior.pdf(temp)
            # if sum(mix.compFix) > 0:
            #     for j in range(mix.components[0].dist_nr):
            #         for l in range(mix.G):
            #             if mix.compFix[l] != 2:
            #                 p = self.compPrior[j].pdf( mix.components[l][j] )
            #                 print l,j,p
            # else:
            #     for l in range(mix.G):
            #         #print mix.components[l]
            #         print '    comp ',l,':',self.compPrior.pdf(mix.components[l])
            #         for o,d in enumerate(self.compPrior):
            #             print '        dist',o,mix.components[l][o],':',d.pdf(mix.components[l][o])
            #
            # print 'nrCompPrior=', self.nrCompPrior * mix.G
            #
            # # prior over number of distinct groups
            # if mix.struct:
            #     for j in range(mix.components[0].dist_nr):
            #         print '    struct:',self.structPrior * len(mix.leaders[j])
            # else:
            #     for j in range(mix.components[0].dist_nr):
            #         print '    struct:', self.structPrior * mix.G
            raise ValueError, 'nan result in MixtureModelPrior.pdf'
        return res

    # mapMStep is used for parameter estimation of lower hierarchy mixtures
    def mapMStep(self, dist, posterior, data, mix_pi=None, dist_ind = None):
        # delegates to a short (one-step) MAP EM run on the nested mixture
        dist.mapEM(data, self,1,0.1,silent =True,mix_pi=mix_pi,mix_posterior=posterior)

    def updateHyperparameters(self, dists, posterior, data):
        """
        Updates the hyperparameters of all feature priors which require it
        (those listed in self.hp_update_indices).

        @param dists: MixtureModel object
        @param posterior: posterior component membership weights
        @param data: DataSet object
        """
        assert self.constant_hyperparams == 0
        assert isinstance(dists, MixtureModel) # XXX debug

        for j in self.hp_update_indices:
            # j-th feature distributions of all components
            d_j = [dists.components[i].distList[j] for i in range(dists.G)]
            self.compPrior[j].updateHyperparameters(d_j,posterior,data.getInternalFeature(j) )

    def flatStr(self,offset):
        # flat-file representation; the log-scale hyperparameters are written
        # back on linear scale (numpy.exp)
        offset +=1
        s = "\t"*offset + ";MixPrior;"+str(self.dist_nr)+";"+str(numpy.exp(self.structPrior))+";"+str(numpy.exp(self.nrCompPrior))+"\n"

        s+= self.piPrior.flatStr(offset)
        for d in self.compPrior:
            s+= d.flatStr(offset)

        return s

    def posterior(self, dist):
        # posterior over model parameters is not defined for this prior
        raise NotImplementedError

    def isValid(self, m):
        """
        Checks whether the MixtureModel 'm' is a valid argument for this
        prior (matching number of components, valid component parameters).

        @raise InvalidDistributionInput: if 'm' is incompatible
        """
        if not isinstance(m,MixtureModel):
            raise InvalidDistributionInput, "MixtureModelPrior: " + str(m)
        else:
            if self.piPrior.M != m.G:
                raise InvalidDistributionInput, "MixtureModelPrior: invalid size of piPrior."

            try:
                # check validity of each component
                for i in range(m.G):
                    self.compPrior.isValid(m.components[i])
            except InvalidDistributionInput,ex:
                ex.message += "\n\tin MixtureModelPrior for component "+str(i)
                raise

    def structPriorHeuristic(self, delta, N):
        """
        Heuristic for setting the structure prior hyper-parameter 'self.structPrior', depending
        on the size of a data set 'N' and parameter 'delta'.
        """
        self.structPrior = - numpy.log(1+delta)*N

    def mapMStepMerge(self, group_list):
        """
        MAP parameter estimation for a merged candidate group of nested
        two-component mixtures.

        @param group_list: list of CandidateGroup objects

        @return: CandidateGroup holding the merged model and statistics

        NOTE(review): 'd_i' is used after the loop; if every component were
        fixed (compFix == 2) it would be unbound -- presumably that cannot
        happen for the G == 2 models asserted here; confirm with callers
        """
        new_dist = copy.copy(group_list[0].dist)
        new_req_stat = copy.deepcopy(group_list[0].req_stat)

        assert new_dist.dist_nr == 1 # XXX
        assert new_dist.G == 2

        for i in range(new_dist.G):
            if new_dist.compFix[i] == 2:
                continue  # fixed components are not reestimated
            sub_group_list = []
            for r in range(len(group_list)):
                sub_group_list.append( CandidateGroup( group_list[r].dist.components[i][0], group_list[r].post_sum, group_list[r].pi_sum, group_list[r].req_stat[i] ) )

            d_i = self.compPrior[0].mapMStepMerge(sub_group_list)
            new_dist.components[i][0] = d_i.dist
            new_req_stat[i] = d_i.req_stat

        return CandidateGroup(new_dist, d_i.post_sum, d_i.pi_sum, new_req_stat )
3340 3341 3342 3343
3344 -class MixtureModel(ProbDistribution):
3345 """ 3346 Class for a context-specific independence (CSI) mixture models. 3347 The components are naive Bayes models (i.e. ProductDistribution objects). 3348 """
3349 - def __init__(self,G, pi, components,compFix=None,struct=0, identifiable = 1):
3350 """ 3351 Constructor 3352 3353 @param G: number of components 3354 @param pi: mixture weights 3355 @param components: list of ProductDistribution objects, each entry is one component 3356 @param compFix: list of optional flags for fixing components in the reestimation 3357 the following values are supported: 1 distribution parameters are fixed, 2 distribution 3358 parameters and mixture coefficients are fixed 3359 @param struct: Flag for CSI structure, 0 = no CSI structure, 1 = CSI structure 3360 """ 3361 assert len(pi) == len(components) == G, str(len(pi)) +', '+ str(len(components))+', '+str(G) 3362 assert abs((1.0 - sum(pi))) < 1e-12, "sum(pi) = " + str(sum(pi)) +", "+str(abs((1.0 - sum(pi)))) 3363 3364 self.freeParams = 0 3365 self.p = components[0].p 3366 3367 # Internally components must be a list of ProductDistribution objects. In case the input is a list of 3368 # ProbDistributions we convert components accordingly. 3369 if isinstance(components[0],ProbDistribution) and not isinstance(components[0],ProductDistribution): 3370 # make sure all elements of components are ProbDistribution of the same dimension 3371 for c in components: 3372 assert isinstance(c,ProbDistribution) 3373 assert c.p <