001 /* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018 package org.apache.commons.math.stat.ranking; 019 020 import java.util.ArrayList; 021 import java.util.Arrays; 022 import java.util.Iterator; 023 import java.util.List; 024 025 import org.apache.commons.math.random.RandomData; 026 import org.apache.commons.math.random.RandomDataImpl; 027 import org.apache.commons.math.random.RandomGenerator; 028 029 030 /** 031 * <p> Ranking based on the natural ordering on doubles.</p> 032 * <p>NaNs are treated according to the configured {@link NaNStrategy} and ties 033 * are handled using the selected {@link TiesStrategy}. 034 * Configuration settings are supplied in optional constructor arguments. 035 * Defaults are {@link NaNStrategy#MAXIMAL} and {@link TiesStrategy#AVERAGE}, 036 * respectively. When using {@link TiesStrategy#RANDOM}, a 037 * {@link RandomGenerator} may be supplied as a constructor argument.</p> 038 * <p>Examples: 039 * <table border="1" cellpadding="3"> 040 * <tr><th colspan="3"> 041 * Input data: (20, 17, 30, 42.3, 17, 50, Double.NaN, Double.NEGATIVE_INFINITY, 17) 042 * </th></tr> 043 * <tr><th>NaNStrategy</th><th>TiesStrategy</th> 044 * <th><code>rank(data)</code></th> 045 * <tr> 046 * <td>default (NaNs maximal)</td> 047 * <td>default (ties averaged)</td> 048 * <td>(5, 3, 6, 7, 3, 8, 9, 1, 3)</td></tr> 049 * <tr> 050 * <td>default (NaNs maximal)</td> 051 * <td>MINIMUM</td> 052 * <td>(5, 2, 6, 7, 2, 8, 9, 1, 2)</td></tr> 053 * <tr> 054 * <td>MINIMAL</td> 055 * <td>default (ties averaged)</td> 056 * <td>(6, 4, 7, 8, 4, 9, 1.5, 1.5, 4)</td></tr> 057 * <tr> 058 * <td>REMOVED</td> 059 * <td>SEQUENTIAL</td> 060 * <td>(5, 2, 6, 7, 3, 8, 1, 4)</td></tr> 061 * <tr> 062 * <td>MINIMAL</td> 063 * <td>MAXIMUM</td> 064 * <td>(6, 5, 7, 8, 5, 9, 2, 2, 5)</td></tr></table></p> 065 * 066 * @since 2.0 067 * @version $Revision: 799857 $ $Date: 2009-08-01 09:07:12 -0400 (Sat, 01 Aug 2009) $ 068 */ 069 public class NaturalRanking implements RankingAlgorithm { 070 071 /** NaN strategy - defaults to NaNs maximal */ 072 private final NaNStrategy nanStrategy; 073 074 /** Ties strategy - defaults to ties averaged */ 075 private final TiesStrategy tiesStrategy; 076 077 /** Source of random data - used only when ties strategy is RANDOM */ 078 private final RandomData randomData; 079 080 /** default NaN strategy */ 081 public static final NaNStrategy DEFAULT_NAN_STRATEGY = NaNStrategy.MAXIMAL; 082 083 /** default ties strategy */ 084 public static final TiesStrategy DEFAULT_TIES_STRATEGY = TiesStrategy.AVERAGE; 085 086 /** 087 * Create a NaturalRanking with default strategies for handling ties and NaNs. 088 */ 089 public NaturalRanking() { 090 super(); 091 tiesStrategy = DEFAULT_TIES_STRATEGY; 092 nanStrategy = DEFAULT_NAN_STRATEGY; 093 randomData = null; 094 } 095 096 /** 097 * Create a NaturalRanking with the given TiesStrategy. 098 * 099 * @param tiesStrategy the TiesStrategy to use 100 */ 101 public NaturalRanking(TiesStrategy tiesStrategy) { 102 super(); 103 this.tiesStrategy = tiesStrategy; 104 nanStrategy = DEFAULT_NAN_STRATEGY; 105 randomData = new RandomDataImpl(); 106 } 107 108 /** 109 * Create a NaturalRanking with the given NaNStrategy. 110 * 111 * @param nanStrategy the NaNStrategy to use 112 */ 113 public NaturalRanking(NaNStrategy nanStrategy) { 114 super(); 115 this.nanStrategy = nanStrategy; 116 tiesStrategy = DEFAULT_TIES_STRATEGY; 117 randomData = null; 118 } 119 120 /** 121 * Create a NaturalRanking with the given NaNStrategy and TiesStrategy. 122 * 123 * @param nanStrategy NaNStrategy to use 124 * @param tiesStrategy TiesStrategy to use 125 */ 126 public NaturalRanking(NaNStrategy nanStrategy, TiesStrategy tiesStrategy) { 127 super(); 128 this.nanStrategy = nanStrategy; 129 this.tiesStrategy = tiesStrategy; 130 randomData = new RandomDataImpl(); 131 } 132 133 /** 134 * Create a NaturalRanking with TiesStrategy.RANDOM and the given 135 * RandomGenerator as the source of random data. 136 * 137 * @param randomGenerator source of random data 138 */ 139 public NaturalRanking(RandomGenerator randomGenerator) { 140 super(); 141 this.tiesStrategy = TiesStrategy.RANDOM; 142 nanStrategy = DEFAULT_NAN_STRATEGY; 143 randomData = new RandomDataImpl(randomGenerator); 144 } 145 146 147 /** 148 * Create a NaturalRanking with the given NaNStrategy, TiesStrategy.RANDOM 149 * and the given source of random data. 150 * 151 * @param nanStrategy NaNStrategy to use 152 * @param randomGenerator source of random data 153 */ 154 public NaturalRanking(NaNStrategy nanStrategy, 155 RandomGenerator randomGenerator) { 156 super(); 157 this.nanStrategy = nanStrategy; 158 this.tiesStrategy = TiesStrategy.RANDOM; 159 randomData = new RandomDataImpl(randomGenerator); 160 } 161 162 /** 163 * Return the NaNStrategy 164 * 165 * @return returns the NaNStrategy 166 */ 167 public NaNStrategy getNanStrategy() { 168 return nanStrategy; 169 } 170 171 /** 172 * Return the TiesStrategy 173 * 174 * @return the TiesStrategy 175 */ 176 public TiesStrategy getTiesStrategy() { 177 return tiesStrategy; 178 } 179 180 /** 181 * Rank <code>data</code> using the natural ordering on Doubles, with 182 * NaN values handled according to <code>nanStrategy</code> and ties 183 * resolved using <code>tiesStrategy.</code> 184 * 185 * @param data array to be ranked 186 * @return array of ranks 187 */ 188 public double[] rank(double[] data) { 189 190 // Array recording initial positions of data to be ranked 191 IntDoublePair[] ranks = new IntDoublePair[data.length]; 192 for (int i = 0; i < data.length; i++) { 193 ranks[i] = new IntDoublePair(data[i], i); 194 } 195 196 // Recode, remove or record positions of NaNs 197 List<Integer> nanPositions = null; 198 switch (nanStrategy) { 199 case MAXIMAL: // Replace NaNs with +INFs 200 recodeNaNs(ranks, Double.POSITIVE_INFINITY); 201 break; 202 case MINIMAL: // Replace NaNs with -INFs 203 recodeNaNs(ranks, Double.NEGATIVE_INFINITY); 204 break; 205 case REMOVED: // Drop NaNs from data 206 ranks = removeNaNs(ranks); 207 break; 208 case FIXED: // Record positions of NaNs 209 nanPositions = getNanPositions(ranks); 210 break; 211 } 212 213 // Sort the IntDoublePairs 214 Arrays.sort(ranks); 215 216 // Walk the sorted array, filling output array using sorted positions, 217 // resolving ties as we go 218 double[] out = new double[ranks.length]; 219 int pos = 1; // position in sorted array 220 out[ranks[0].getPosition()] = pos; 221 List<Integer> tiesTrace = new ArrayList<Integer>(); 222 tiesTrace.add(ranks[0].getPosition()); 223 for (int i = 1; i < ranks.length; i++) { 224 if (Double.compare(ranks[i].getValue(), ranks[i - 1].getValue()) > 0) { 225 // tie sequence has ended (or had length 1) 226 pos = i + 1; 227 if (tiesTrace.size() > 1) { // if seq is nontrivial, resolve 228 resolveTie(out, tiesTrace); 229 } 230 tiesTrace = new ArrayList<Integer>(); 231 tiesTrace.add(ranks[i].getPosition()); 232 } else { 233 // tie sequence continues 234 tiesTrace.add(ranks[i].getPosition()); 235 } 236 out[ranks[i].getPosition()] = pos; 237 } 238 if (tiesTrace.size() > 1) { // handle tie sequence at end 239 resolveTie(out, tiesTrace); 240 } 241 if (nanStrategy == NaNStrategy.FIXED) { 242 restoreNaNs(out, nanPositions); 243 } 244 return out; 245 } 246 247 /** 248 * Returns an array that is a copy of the input array with IntDoublePairs 249 * having NaN values removed. 250 * 251 * @param ranks input array 252 * @return array with NaN-valued entries removed 253 */ 254 private IntDoublePair[] removeNaNs(IntDoublePair[] ranks) { 255 if (!containsNaNs(ranks)) { 256 return ranks; 257 } 258 IntDoublePair[] outRanks = new IntDoublePair[ranks.length]; 259 int j = 0; 260 for (int i = 0; i < ranks.length; i++) { 261 if (Double.isNaN(ranks[i].getValue())) { 262 // drop, but adjust original ranks of later elements 263 for (int k = i + 1; k < ranks.length; k++) { 264 ranks[k] = new IntDoublePair( 265 ranks[k].getValue(), ranks[k].getPosition() - 1); 266 } 267 } else { 268 outRanks[j] = new IntDoublePair( 269 ranks[i].getValue(), ranks[i].getPosition()); 270 j++; 271 } 272 } 273 IntDoublePair[] returnRanks = new IntDoublePair[j]; 274 System.arraycopy(outRanks, 0, returnRanks, 0, j); 275 return returnRanks; 276 } 277 278 /** 279 * Recodes NaN values to the given value. 280 * 281 * @param ranks array to recode 282 * @param value the value to replace NaNs with 283 */ 284 private void recodeNaNs(IntDoublePair[] ranks, double value) { 285 for (int i = 0; i < ranks.length; i++) { 286 if (Double.isNaN(ranks[i].getValue())) { 287 ranks[i] = new IntDoublePair( 288 value, ranks[i].getPosition()); 289 } 290 } 291 } 292 293 /** 294 * Checks for presence of NaNs in <code>ranks.</code> 295 * 296 * @param ranks array to be searched for NaNs 297 * @return true iff ranks contains one or more NaNs 298 */ 299 private boolean containsNaNs(IntDoublePair[] ranks) { 300 for (int i = 0; i < ranks.length; i++) { 301 if (Double.isNaN(ranks[i].getValue())) { 302 return true; 303 } 304 } 305 return false; 306 } 307 308 /** 309 * Resolve a sequence of ties, using the configured {@link TiesStrategy}. 310 * The input <code>ranks</code> array is expected to take the same value 311 * for all indices in <code>tiesTrace</code>. The common value is recoded 312 * according to the tiesStrategy. For example, if ranks = <5,8,2,6,2,7,1,2>, 313 * tiesTrace = <2,4,7> and tiesStrategy is MINIMUM, ranks will be unchanged. 314 * The same array and trace with tiesStrategy AVERAGE will come out 315 * <5,8,3,6,3,7,1,3>. 316 * 317 * @param ranks array of ranks 318 * @param tiesTrace list of indices where <code>ranks</code> is constant 319 * -- that is, for any i and j in TiesTrace, <code> ranks[i] == ranks[j] 320 * </code> 321 */ 322 private void resolveTie(double[] ranks, List<Integer> tiesTrace) { 323 324 // constant value of ranks over tiesTrace 325 final double c = ranks[tiesTrace.get(0)]; 326 327 // length of sequence of tied ranks 328 final int length = tiesTrace.size(); 329 330 switch (tiesStrategy) { 331 case AVERAGE: // Replace ranks with average 332 fill(ranks, tiesTrace, (2 * c + length - 1) / 2d); 333 break; 334 case MAXIMUM: // Replace ranks with maximum values 335 fill(ranks, tiesTrace, c + length - 1); 336 break; 337 case MINIMUM: // Replace ties with minimum 338 fill(ranks, tiesTrace, c); 339 break; 340 case RANDOM: // Fill with random integral values in [c, c + length - 1] 341 Iterator<Integer> iterator = tiesTrace.iterator(); 342 long f = Math.round(c); 343 while (iterator.hasNext()) { 344 ranks[iterator.next()] = 345 randomData.nextLong(f, f + length - 1); 346 } 347 break; 348 case SEQUENTIAL: // Fill sequentially from c to c + length - 1 349 // walk and fill 350 iterator = tiesTrace.iterator(); 351 f = Math.round(c); 352 int i = 0; 353 while (iterator.hasNext()) { 354 ranks[iterator.next()] = f + i++; 355 } 356 break; 357 } 358 } 359 360 /** 361 * Sets<code>data[i] = value</code> for each i in <code>tiesTrace.</code> 362 * 363 * @param data array to modify 364 * @param tiesTrace list of index values to set 365 * @param value value to set 366 */ 367 private void fill(double[] data, List<Integer> tiesTrace, double value) { 368 Iterator<Integer> iterator = tiesTrace.iterator(); 369 while (iterator.hasNext()) { 370 data[iterator.next()] = value; 371 } 372 } 373 374 /** 375 * Set <code>ranks[i] = Double.NaN</code> for each i in <code>nanPositions.</code> 376 * 377 * @param ranks array to modify 378 * @param nanPositions list of index values to set to <code>Double.NaN</code> 379 */ 380 private void restoreNaNs(double[] ranks, List<Integer> nanPositions) { 381 if (nanPositions.size() == 0) { 382 return; 383 } 384 Iterator<Integer> iterator = nanPositions.iterator(); 385 while (iterator.hasNext()) { 386 ranks[iterator.next().intValue()] = Double.NaN; 387 } 388 389 } 390 391 /** 392 * Returns a list of indexes where <code>ranks</code> is <code>NaN.</code> 393 * 394 * @param ranks array to search for <code>NaNs</code> 395 * @return list of indexes i such that <code>ranks[i] = NaN</code> 396 */ 397 private List<Integer> getNanPositions(IntDoublePair[] ranks) { 398 ArrayList<Integer> out = new ArrayList<Integer>(); 399 for (int i = 0; i < ranks.length; i++) { 400 if (Double.isNaN(ranks[i].getValue())) { 401 out.add(Integer.valueOf(i)); 402 } 403 } 404 return out; 405 } 406 407 /** 408 * Represents the position of a double value in an ordering. 409 * Comparable interface is implemented so Arrays.sort can be used 410 * to sort an array of IntDoublePairs by value. Note that the 411 * implicitly defined natural ordering is NOT consistent with equals. 412 */ 413 private static class IntDoublePair implements Comparable<IntDoublePair> { 414 415 /** Value of the pair */ 416 final private double value; 417 418 /** Original position of the pair */ 419 final private int position; 420 421 /** 422 * Construct an IntDoublePair with the given value and position. 423 * @param value the value of the pair 424 * @param position the original position 425 */ 426 public IntDoublePair(double value, int position) { 427 this.value = value; 428 this.position = position; 429 } 430 431 /** 432 * Compare this IntDoublePair to another pair. 433 * Only the <strong>values</strong> are compared. 434 * 435 * @param other the other pair to compare this to 436 * @return result of <code>Double.compare(value, other.value)</code> 437 */ 438 public int compareTo(IntDoublePair other) { 439 return Double.compare(value, other.value); 440 } 441 442 /** 443 * Returns the value of the pair. 444 * @return value 445 */ 446 public double getValue() { 447 return value; 448 } 449 450 /** 451 * Returns the original position of the pair. 452 * @return position 453 */ 454 public int getPosition() { 455 return position; 456 } 457 } 458 }