random_shuffle.h

Go to the documentation of this file.
00001 // -*- C++ -*-
00002 
00003 // Copyright (C) 2007, 2008 Free Software Foundation, Inc.
00004 //
00005 // This file is part of the GNU ISO C++ Library.  This library is free
00006 // software; you can redistribute it and/or modify it under the terms
00007 // of the GNU General Public License as published by the Free Software
00008 // Foundation; either version 2, or (at your option) any later
00009 // version.
00010 
00011 // This library is distributed in the hope that it will be useful, but
00012 // WITHOUT ANY WARRANTY; without even the implied warranty of
00013 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00014 // General Public License for more details.
00015 
00016 // You should have received a copy of the GNU General Public License
00017 // along with this library; see the file COPYING.  If not, write to
00018 // the Free Software Foundation, 59 Temple Place - Suite 330, Boston,
00019 // MA 02111-1307, USA.
00020 
00021 // As a special exception, you may use this file as part of a free
00022 // software library without restriction.  Specifically, if other files
00023 // instantiate templates or use macros or inline functions from this
00024 // file, or you compile this file and link it with other files to
00025 // produce an executable, this file does not by itself cause the
00026 // resulting executable to be covered by the GNU General Public
00027 // License.  This exception does not however invalidate any other
00028 // reasons why the executable file might be covered by the GNU General
00029 // Public License.
00030 
00031 /** @file parallel/random_shuffle.h
00032  *  @brief Parallel implementation of std::random_shuffle().
00033  *  This file is a GNU parallel extension to the Standard C++ Library.
00034  */
00035 
00036 // Written by Johannes Singler.
00037 
00038 #ifndef _GLIBCXX_PARALLEL_RANDOM_SHUFFLE_H
00039 #define _GLIBCXX_PARALLEL_RANDOM_SHUFFLE_H 1
00040 
00041 #include <limits>
00042 #include <bits/stl_numeric.h>
00043 #include <parallel/parallel.h>
00044 #include <parallel/random_number.h>
00045 
00046 namespace __gnu_parallel
00047 {
00048 /** @brief Type to hold the index of a bin.
00049   *
00050   *  Since many variables of this type are allocated, it should be
00051   *  chosen as small as possible.
00052   */
00053 typedef unsigned short bin_index;
00054 
00055 /** @brief Data known to every thread participating in
00056     __gnu_parallel::parallel_random_shuffle(). */
00057 template<typename RandomAccessIterator>
00058   struct DRandomShufflingGlobalData
00059   {
00060     typedef std::iterator_traits<RandomAccessIterator> traits_type;
00061     typedef typename traits_type::value_type value_type;
00062     typedef typename traits_type::difference_type difference_type;
00063 
00064     /** @brief Begin iterator of the source. */
00065     RandomAccessIterator& source;
00066 
00067     /** @brief Temporary arrays for each thread. */
00068     value_type** temporaries;
00069 
00070     /** @brief Two-dimensional array to hold the thread-bin distribution.
00071      *
00072      *  Dimensions (num_threads + 1) x (num_bins + 1). */
00073     difference_type** dist;
00074 
00075     /** @brief Start indexes of the threads' chunks. */
00076     difference_type* starts;
00077 
00078     /** @brief Number of the thread that will further process the
00079     corresponding bin. */
00080     thread_index_t* bin_proc;
00081 
00082     /** @brief Number of bins to distribute to. */
00083     int num_bins;
00084 
00085     /** @brief Number of bits needed to address the bins. */
00086     int num_bits;
00087 
00088     /** @brief Constructor. */
00089     DRandomShufflingGlobalData(RandomAccessIterator& _source)
00090     : source(_source) { }
00091   };
00092 
00093 /** @brief Local data for a thread participating in
00094     __gnu_parallel::parallel_random_shuffle().
00095   */
00096 template<typename RandomAccessIterator, typename RandomNumberGenerator>
00097   struct DRSSorterPU
00098   {
00099     /** @brief Number of threads participating in total. */
00100     int num_threads;
00101 
00102     /** @brief Begin index for bins taken care of by this thread. */
00103     bin_index bins_begin;
00104 
00105     /** @brief End index for bins taken care of by this thread. */
00106     bin_index bins_end;
00107 
00108     /** @brief Random seed for this thread. */
00109     uint32 seed;
00110 
00111     /** @brief Pointer to global data. */
00112     DRandomShufflingGlobalData<RandomAccessIterator>* sd;
00113   };
00114 
00115 /** @brief Generate a random number in @c [0,2^logp).
00116   *  @param logp Logarithm (basis 2) of the upper range bound.
00117   *  @param rng Random number generator to use.
00118   */
00119 template<typename RandomNumberGenerator>
00120   inline int
00121   random_number_pow2(int logp, RandomNumberGenerator& rng)
00122   { return rng.genrand_bits(logp); }
00123 
00124 /** @brief Random shuffle code executed by each thread.
00125   *  @param pus Array of thread-local data records. */
00126 template<typename RandomAccessIterator, typename RandomNumberGenerator>
00127   void 
00128   parallel_random_shuffle_drs_pu(DRSSorterPU<RandomAccessIterator,
00129                                  RandomNumberGenerator>* pus)
00130   {
00131     typedef std::iterator_traits<RandomAccessIterator> traits_type;
00132     typedef typename traits_type::value_type value_type;
00133     typedef typename traits_type::difference_type difference_type;
00134 
00135     thread_index_t iam = omp_get_thread_num();
00136     DRSSorterPU<RandomAccessIterator, RandomNumberGenerator>* d = &pus[iam];
00137     DRandomShufflingGlobalData<RandomAccessIterator>* sd = d->sd;
00138 
00139     // Indexing: dist[bin][processor]
00140     difference_type length = sd->starts[iam + 1] - sd->starts[iam];
00141     bin_index* oracles = new bin_index[length];
00142     difference_type* dist = new difference_type[sd->num_bins + 1];
00143     bin_index* bin_proc = new bin_index[sd->num_bins];
00144     value_type** temporaries = new value_type*[d->num_threads];
00145 
00146     // Compute oracles and count appearances.
00147     for (bin_index b = 0; b < sd->num_bins + 1; ++b)
00148       dist[b] = 0;
00149     int num_bits = sd->num_bits;
00150 
00151     random_number rng(d->seed);
00152 
00153     // First main loop.
00154     for (difference_type i = 0; i < length; ++i)
00155       {
00156         bin_index oracle = random_number_pow2(num_bits, rng);
00157         oracles[i] = oracle;
00158 
00159         // To allow prefix (partial) sum.
00160         ++(dist[oracle + 1]);
00161       }
00162 
00163     for (bin_index b = 0; b < sd->num_bins + 1; ++b)
00164       sd->dist[b][iam + 1] = dist[b];
00165 
00166 #   pragma omp barrier
00167 
00168 #   pragma omp single
00169     {
00170       // Sum up bins, sd->dist[s + 1][d->num_threads] now contains the
00171       // total number of items in bin s
00172       for (bin_index s = 0; s < sd->num_bins; ++s)
00173         __gnu_sequential::partial_sum(sd->dist[s + 1],
00174                                       sd->dist[s + 1] + d->num_threads + 1,
00175                                       sd->dist[s + 1]);
00176     }
00177 
00178 #   pragma omp barrier
00179 
00180     sequence_index_t offset = 0, global_offset = 0;
00181     for (bin_index s = 0; s < d->bins_begin; ++s)
00182       global_offset += sd->dist[s + 1][d->num_threads];
00183 
00184 #   pragma omp barrier
00185 
00186     for (bin_index s = d->bins_begin; s < d->bins_end; ++s)
00187       {
00188     for (int t = 0; t < d->num_threads + 1; ++t)
00189       sd->dist[s + 1][t] += offset;
00190     offset = sd->dist[s + 1][d->num_threads];
00191       }
00192 
00193     sd->temporaries[iam] = static_cast<value_type*>(
00194       ::operator new(sizeof(value_type) * offset));
00195 
00196 #   pragma omp barrier
00197 
00198     // Draw local copies to avoid false sharing.
00199     for (bin_index b = 0; b < sd->num_bins + 1; ++b)
00200       dist[b] = sd->dist[b][iam];
00201     for (bin_index b = 0; b < sd->num_bins; ++b)
00202       bin_proc[b] = sd->bin_proc[b];
00203     for (thread_index_t t = 0; t < d->num_threads; ++t)
00204       temporaries[t] = sd->temporaries[t];
00205 
00206     RandomAccessIterator source = sd->source;
00207     difference_type start = sd->starts[iam];
00208 
00209     // Distribute according to oracles, second main loop.
00210     for (difference_type i = 0; i < length; ++i)
00211       {
00212         bin_index target_bin = oracles[i];
00213         thread_index_t target_p = bin_proc[target_bin];
00214 
00215         // Last column [d->num_threads] stays unchanged.
00216         ::new(&(temporaries[target_p][dist[target_bin + 1]++]))
00217         value_type(*(source + i + start));
00218       }
00219 
00220     delete[] oracles;
00221     delete[] dist;
00222     delete[] bin_proc;
00223     delete[] temporaries;
00224 
00225 #   pragma omp barrier
00226 
00227     // Shuffle bins internally.
00228     for (bin_index b = d->bins_begin; b < d->bins_end; ++b)
00229       {
00230         value_type* begin =
00231                     sd->temporaries[iam] +
00232                     ((b == d->bins_begin) ? 0 : sd->dist[b][d->num_threads]),
00233                   * end =
00234                     sd->temporaries[iam] + sd->dist[b + 1][d->num_threads];
00235         sequential_random_shuffle(begin, end, rng);
00236         std::copy(begin, end, sd->source + global_offset +
00237             ((b == d->bins_begin) ? 0 : sd->dist[b][d->num_threads]));
00238       }
00239 
00240     ::operator delete(sd->temporaries[iam]);
00241   }
00242 
00243 /** @brief Round up to the next greater power of 2.
00244   *  @param x Integer to round up */
00245 template<typename T>
00246   T 
00247   round_up_to_pow2(T x)
00248   {
00249     if (x <= 1)
00250       return 1;
00251     else
00252       return (T)1 << (log2(x - 1) + 1);
00253   }
00254 
00255 /** @brief Main parallel random shuffle step.
00256   *  @param begin Begin iterator of sequence.
00257   *  @param end End iterator of sequence.
00258   *  @param n Length of sequence.
00259   *  @param num_threads Number of threads to use.
00260   *  @param rng Random number generator to use.
00261   */
00262 template<typename RandomAccessIterator, typename RandomNumberGenerator>
00263   void
00264   parallel_random_shuffle_drs(RandomAccessIterator begin,
00265                   RandomAccessIterator end,
00266                   typename std::iterator_traits
00267                   <RandomAccessIterator>::difference_type n,
00268                   thread_index_t num_threads,
00269                   RandomNumberGenerator& rng)
00270   {
00271     typedef std::iterator_traits<RandomAccessIterator> traits_type;
00272     typedef typename traits_type::value_type value_type;
00273     typedef typename traits_type::difference_type difference_type;
00274 
00275     _GLIBCXX_CALL(n)
00276 
00277     const _Settings& __s = _Settings::get();
00278 
00279     if (num_threads > n)
00280       num_threads = static_cast<thread_index_t>(n);
00281 
00282     bin_index num_bins, num_bins_cache;
00283 
00284 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
00285     // Try the L1 cache first.
00286 
00287     // Must fit into L1.
00288     num_bins_cache = std::max<difference_type>(
00289         1, n / (__s.L1_cache_size_lb / sizeof(value_type)));
00290     num_bins_cache = round_up_to_pow2(num_bins_cache);
00291 
00292     // No more buckets than TLB entries, power of 2
00293     // Power of 2 and at least one element per bin, at most the TLB size.
00294     num_bins = std::min<difference_type>(n, num_bins_cache);
00295 
00296 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
00297     // 2 TLB entries needed per bin.
00298     num_bins = std::min<difference_type>(__s.TLB_size / 2, num_bins);
00299 #endif
00300     num_bins = round_up_to_pow2(num_bins);
00301 
00302     if (num_bins < num_bins_cache)
00303       {
00304 #endif
00305         // Now try the L2 cache
00306         // Must fit into L2
00307         num_bins_cache = static_cast<bin_index>(std::max<difference_type>(
00308             1, n / (__s.L2_cache_size / sizeof(value_type))));
00309         num_bins_cache = round_up_to_pow2(num_bins_cache);
00310 
00311         // No more buckets than TLB entries, power of 2.
00312         num_bins = static_cast<bin_index>(
00313             std::min(n, static_cast<difference_type>(num_bins_cache)));
00314         // Power of 2 and at least one element per bin, at most the TLB size.
00315 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
00316         // 2 TLB entries needed per bin.
00317         num_bins = std::min(
00318             static_cast<difference_type>(__s.TLB_size / 2), num_bins);
00319 #endif
00320           num_bins = round_up_to_pow2(num_bins);
00321 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
00322       }
00323 #endif
00324 
00325     num_threads = std::min<bin_index>(num_threads, num_bins);
00326 
00327     if (num_threads <= 1)
00328       return sequential_random_shuffle(begin, end, rng);
00329 
00330     DRandomShufflingGlobalData<RandomAccessIterator> sd(begin);
00331     DRSSorterPU<RandomAccessIterator, random_number >* pus;
00332     difference_type* starts;
00333 
00334 #   pragma omp parallel num_threads(num_threads)
00335       {
00336         thread_index_t num_threads = omp_get_num_threads();
00337 #       pragma omp single
00338           {
00339             pus = new DRSSorterPU<RandomAccessIterator, random_number>
00340                 [num_threads];
00341 
00342             sd.temporaries = new value_type*[num_threads];
00343             sd.dist = new difference_type*[num_bins + 1];
00344             sd.bin_proc = new thread_index_t[num_bins];
00345             for (bin_index b = 0; b < num_bins + 1; ++b)
00346               sd.dist[b] = new difference_type[num_threads + 1];
00347             for (bin_index b = 0; b < (num_bins + 1); ++b)
00348               {
00349                 sd.dist[0][0] = 0;
00350                 sd.dist[b][0] = 0;
00351               }
00352             starts = sd.starts = new difference_type[num_threads + 1];
00353             int bin_cursor = 0;
00354             sd.num_bins = num_bins;
00355             sd.num_bits = log2(num_bins);
00356 
00357             difference_type chunk_length = n / num_threads,
00358                             split = n % num_threads, start = 0;
00359             difference_type bin_chunk_length = num_bins / num_threads,
00360                             bin_split = num_bins % num_threads;
00361             for (thread_index_t i = 0; i < num_threads; ++i)
00362               {
00363                 starts[i] = start;
00364                 start += (i < split) ? (chunk_length + 1) : chunk_length;
00365                 int j = pus[i].bins_begin = bin_cursor;
00366 
00367                 // Range of bins for this processor.
00368                 bin_cursor += (i < bin_split) ?
00369                     (bin_chunk_length + 1) : bin_chunk_length;
00370                 pus[i].bins_end = bin_cursor;
00371                 for (; j < bin_cursor; ++j)
00372                   sd.bin_proc[j] = i;
00373                 pus[i].num_threads = num_threads;
00374                 pus[i].seed = rng(std::numeric_limits<uint32>::max());
00375                 pus[i].sd = &sd;
00376               }
00377             starts[num_threads] = start;
00378           } //single
00379         // Now shuffle in parallel.
00380         parallel_random_shuffle_drs_pu(pus);
00381       }  // parallel
00382 
00383     delete[] starts;
00384     delete[] sd.bin_proc;
00385     for (int s = 0; s < (num_bins + 1); ++s)
00386       delete[] sd.dist[s];
00387     delete[] sd.dist;
00388     delete[] sd.temporaries;
00389 
00390     delete[] pus;
00391   }
00392 
00393 /** @brief Sequential cache-efficient random shuffle.
00394  *  @param begin Begin iterator of sequence.
00395  *  @param end End iterator of sequence.
00396  *  @param rng Random number generator to use.
00397  */
00398 template<typename RandomAccessIterator, typename RandomNumberGenerator>
00399   void
00400   sequential_random_shuffle(RandomAccessIterator begin, 
00401                             RandomAccessIterator end,
00402                             RandomNumberGenerator& rng)
00403   {
00404     typedef std::iterator_traits<RandomAccessIterator> traits_type;
00405     typedef typename traits_type::value_type value_type;
00406     typedef typename traits_type::difference_type difference_type;
00407 
00408     difference_type n = end - begin;
00409     const _Settings& __s = _Settings::get();
00410 
00411     bin_index num_bins, num_bins_cache;
00412 
00413 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
00414     // Try the L1 cache first, must fit into L1.
00415     num_bins_cache =
00416         std::max<difference_type>
00417             (1, n / (__s.L1_cache_size_lb / sizeof(value_type)));
00418     num_bins_cache = round_up_to_pow2(num_bins_cache);
00419 
00420     // No more buckets than TLB entries, power of 2
00421     // Power of 2 and at least one element per bin, at most the TLB size
00422     num_bins = std::min(n, (difference_type)num_bins_cache);
00423 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
00424     // 2 TLB entries needed per bin
00425     num_bins = std::min((difference_type)__s.TLB_size / 2, num_bins);
00426 #endif
00427     num_bins = round_up_to_pow2(num_bins);
00428 
00429     if (num_bins < num_bins_cache)
00430       {
00431 #endif
00432         // Now try the L2 cache, must fit into L2.
00433         num_bins_cache =
00434             static_cast<bin_index>(std::max<difference_type>(
00435                 1, n / (__s.L2_cache_size / sizeof(value_type))));
00436         num_bins_cache = round_up_to_pow2(num_bins_cache);
00437 
00438         // No more buckets than TLB entries, power of 2
00439         // Power of 2 and at least one element per bin, at most the TLB size.
00440         num_bins = static_cast<bin_index>
00441             (std::min(n, static_cast<difference_type>(num_bins_cache)));
00442 
00443 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
00444         // 2 TLB entries needed per bin
00445         num_bins =
00446             std::min<difference_type>(__s.TLB_size / 2, num_bins);
00447 #endif
00448         num_bins = round_up_to_pow2(num_bins);
00449 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
00450       }
00451 #endif
00452 
00453     int num_bits = log2(num_bins);
00454 
00455     if (num_bins > 1)
00456       {
00457         value_type* target = static_cast<value_type*>(
00458           ::operator new(sizeof(value_type) * n));
00459         bin_index* oracles = new bin_index[n];
00460         difference_type* dist0 = new difference_type[num_bins + 1],
00461                        * dist1 = new difference_type[num_bins + 1];
00462 
00463         for (int b = 0; b < num_bins + 1; ++b)
00464           dist0[b] = 0;
00465 
00466         random_number bitrng(rng(0xFFFFFFFF));
00467 
00468         for (difference_type i = 0; i < n; ++i)
00469           {
00470             bin_index oracle = random_number_pow2(num_bits, bitrng);
00471             oracles[i] = oracle;
00472 
00473             // To allow prefix (partial) sum.
00474             ++(dist0[oracle + 1]);
00475           }
00476 
00477         // Sum up bins.
00478         __gnu_sequential::partial_sum(dist0, dist0 + num_bins + 1, dist0);
00479 
00480         for (int b = 0; b < num_bins + 1; ++b)
00481           dist1[b] = dist0[b];
00482 
00483         // Distribute according to oracles.
00484         for (difference_type i = 0; i < n; ++i)
00485           ::new(&(target[(dist0[oracles[i]])++])) value_type(*(begin + i));
00486 
00487         for (int b = 0; b < num_bins; ++b)
00488           {
00489             sequential_random_shuffle(target + dist1[b],
00490                                       target + dist1[b + 1],
00491                                       rng);
00492           }
00493 
00494         // Copy elements back.
00495         std::copy(target, target + n, begin);
00496 
00497         delete[] dist0;
00498         delete[] dist1;
00499         delete[] oracles;
00500         ::operator delete(target);
00501       }
00502     else
00503       __gnu_sequential::random_shuffle(begin, end, rng);
00504   }
00505 
00506 /** @brief Parallel random public call.
00507  *  @param begin Begin iterator of sequence.
00508  *  @param end End iterator of sequence.
00509  *  @param rng Random number generator to use.
00510  */
00511 template<typename RandomAccessIterator, typename RandomNumberGenerator>
00512   inline void
00513   parallel_random_shuffle(RandomAccessIterator begin,
00514                           RandomAccessIterator end,
00515                           RandomNumberGenerator rng = random_number())
00516   {
00517     typedef std::iterator_traits<RandomAccessIterator> traits_type;
00518     typedef typename traits_type::difference_type difference_type;
00519     difference_type n = end - begin;
00520     parallel_random_shuffle_drs(begin, end, n, get_max_threads(), rng) ;
00521   }
00522 
00523 }
00524 
00525 #endif

Generated on Sat Dec 12 09:40:13 2009 for libstdc++ by  doxygen 1.5.6