AllianceDB  0.0.1
AllianceDB is an open-source suite, including benchmarks and libraries, for evaluating and improving stream operation algorithms on modern hardware.
MicroDataSet.hpp
1 
2 //Copyright (C) 2022 by the IntelliStream team (https://github.com/intellistream)
3 // Created by tony on 03/03/22.
4 //
5 
6 #ifndef _UTILS_MICRODATASET_H_
7 #define _UTILS_MICRODATASET_H_
8 #pragma once
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <time.h>
#include <algorithm>
#include <cmath>
#include <iostream>
#include <random>
#include <stdexcept>
#include <type_traits>
#include <vector>
17 using namespace std;
18 namespace INTELLI {
/**
 * @class MicroDataSet
 * @brief The all-in-one class for the Micro dataset.
 *
 * Produces synthetic vectors of keys and timestamps (incremental, uniformly random, Zipf
 * distributed). When the object is built with a seed, every generator call re-creates its
 * engine from that seed, so repeating a call with the same arguments yields the same
 * sequence. Without a seed, the engine is seeded from std::random_device on every call.
 */
class MicroDataSet {
 private:
  /** Seed source consulted only when no explicit seed was supplied. */
  std::random_device rd;
  /** True when the object was constructed with an explicit seed. */
  bool hasSeed = false;
  /** The user-supplied seed; only read while hasSeed is true. */
  uint64_t seed = 0;

  /**
   * @brief Create a freshly seeded random engine.
   * @tparam genType The engine type, must be constructible from an integer seed
   * @return An engine seeded with the stored seed, or with rd() when no seed was given
   */
  template<class genType>
  genType makeEngine() {
    if (hasSeed) {
      return genType(seed);
    }
    return genType(rd());
  }

 public:
  /**
   * @brief default construction, with auto random generator
   */
  MicroDataSet() = default;

  /**
   * @brief construction with seed
   * @param _seed The seed used by every random generator call of this object
   */
  MicroDataSet(uint64_t _seed) : hasSeed(true), seed(_seed) {}

  ~MicroDataSet() = default;

  /**
   * @brief To generate incremental alphabet, starting from 1 and ending at len
   * @tparam dType The element type
   * @param len The number of elements
   * @return The vector {1, 2, ..., len} converted to dType
   */
  template<class dType=uint32_t>
  std::vector<dType> genIncrementalAlphabet(size_t len) {
    std::vector<dType> ru(len);
    for (size_t i = 0; i < len; i++) {
      /* don't let 0 be in the alphabet */
      ru[i] = static_cast<dType>(i + 1);
    }
    return ru;
  }

  /**
   * @brief The function to generate a vector of integers which has zipf distribution
   * @tparam tsType The element type
   * @param len The number of elements to generate
   * @param maxV The alphabet size; generated values lie in [1, maxV]
   * @param fac The zipf factor passed to genZipfLut
   * @return The generated vector (empty when len is 0)
   * @throws std::invalid_argument when len > 0 and maxV < 1
   */
  template<class tsType=size_t>
  std::vector<tsType> genZipfInt(size_t len, tsType maxV, double fac) {
    std::vector<tsType> ret(len);
    if (len == 0) {
      return ret;
    }
    if (maxV < static_cast<tsType>(1)) {
      throw std::invalid_argument("genZipfInt: maxV must be at least 1");
    }
    const size_t alphabetSize = static_cast<size_t>(maxV);
    /* standard mersenne_twister_engine, seeded with rd() unless a seed was given */
    std::mt19937_64 gen = makeEngine<std::mt19937_64>();
    std::uniform_real_distribution<> dis(0, 1);
    const std::vector<double> lut = genZipfLut<double>(alphabetSize, fac);
    for (size_t i = 0; i < len; i++) {
      /* take random number */
      const double r = dis(gen);
      /* the item is the first entry of the cumulative table that is >= r */
      size_t pos = static_cast<size_t>(std::lower_bound(lut.begin(), lut.end(), r) - lut.begin());
      if (pos >= alphabetSize) {
        /* guard against rounding in the table: fall back to the last item */
        pos = alphabetSize - 1;
      }
      /* items are 1-based, matching genIncrementalAlphabet */
      ret[i] = static_cast<tsType>(pos + 1);
    }
    return ret;
  }

  /**
   * @brief generate the vector of random integer
   * @tparam tsType The element type
   * @tparam genType The random engine type
   * @param len The number of elements to generate
   * @param maxV The inclusive upper bound
   * @param minV The inclusive lower bound
   * @return The generated vector (empty when len is 0)
   * @throws std::invalid_argument when len > 0 and minV > maxV
   * @note Values are drawn through a 64-bit distribution of matching signedness, so bounds
   * outside the range of int are not truncated.
   */
  template<class tsType=uint32_t, class genType=std::mt19937>
  std::vector<tsType> genRandInt(size_t len, tsType maxV, tsType minV = 0) {
    std::vector<tsType> ret(len);
    if (len == 0) {
      return ret;
    }
    if (maxV < minV) {
      throw std::invalid_argument("genRandInt: minV must not exceed maxV");
    }
    /* uniform_int_distribution only accepts the standard integer types, so widen tsType */
    typedef typename std::conditional<std::is_signed<tsType>::value,
                                      long long,
                                      unsigned long long>::type wideType;
    genType gen = makeEngine<genType>();
    std::uniform_int_distribution<wideType> dis(static_cast<wideType>(minV), static_cast<wideType>(maxV));
    for (size_t i = 0; i < len; i++) {
      ret[i] = static_cast<tsType>(dis(gen));
    }
    return ret;
  }

  /**
   * @brief To generate the zipf Lut
   * @tparam dType The floating point type of the table
   * @param len The number of entries (alphabet size)
   * @param fac The zipf factor; entry weights are 1 / pow(i, fac) for i = 1..len
   * @return The cumulative table: entry i-1 is the sum of the first i weights divided by the
   * sum of all weights, so the entries are cumulative shares of the total weight
   */
  template<class dType=double>
  std::vector<dType> genZipfLut(size_t len, dType fac) {
    std::vector<dType> lut(len);
    /* first pass: running (unnormalised) prefix sums of the weights */
    dType sum = 0.0;
    for (size_t i = 1; i <= len; i++) {
      sum += 1.0 / std::pow(i, fac);
      lut[i - 1] = sum;
    }
    /*
     * second pass: scale such that
     *
     *   sum (weight[i], i=1..alphabet_size) = 1.0
     */
    for (size_t k = 0; k < len; k++) {
      lut[k] = lut[k] / sum;
    }
    return lut;
  }

  /**
   * @brief The function to generate a vector of timestamp which grows smoothly
   * @tparam tsType The timestamp type
   * @param len The number of timestamps
   * @param step The timestamp is increased once every step elements (including element 0)
   * @param interval The amount added on each increase
   * @return The generated vector (empty when len is 0)
   * @throws std::invalid_argument when len > 0 and step is 0
   */
  template<class tsType=size_t>
  std::vector<tsType> genSmoothTimeStamp(size_t len, size_t step, size_t interval) {
    std::vector<tsType> ret(len);
    if (len == 0) {
      return ret;
    }
    if (step == 0) {
      throw std::invalid_argument("genSmoothTimeStamp: step must be greater than 0");
    }
    tsType ts = 0;
    for (size_t i = 0; i < len; i++) {
      if (i % step == 0) {
        ts += interval;
      }
      ret[i] = ts;
    }
    return ret;
  }

  /**
   * @brief The function to generate a vector of timestamp which has zipf distribution
   * @tparam tsType The timestamp type
   * @param len The number of timestamps
   * @param maxTime The largest timestamp that can be generated
   * @param fac The zipf factor
   * @return The zipf values produced by genZipfInt, sorted in ascending order
   * @throws std::invalid_argument under the same conditions as genZipfInt
   */
  template<class tsType=size_t>
  std::vector<tsType> genZipfTimeStamp(size_t len, tsType maxTime, double fac) {
    std::vector<tsType> ret = genZipfInt<tsType>(len, maxTime, fac);
    std::sort(ret.begin(), ret.end()); // just incremental re-arrange
    return ret;
  }
};
240 }
245 #endif // _UTILS_MICRODATASET_H_
The all-in-one class for the Micro dataset.
Definition: MicroDataSet.hpp:44
MicroDataSet(uint64_t _seed)
construction with seed
Definition: MicroDataSet.hpp:61
MicroDataSet()
default construction, with auto random generator
Definition: MicroDataSet.hpp:54
vector< tsType > genZipfInt(size_t len, tsType maxV, double fac)
Generates a vector of integers that follow a Zipf distribution.
Definition: MicroDataSet.hpp:94
vector< tsType > genRandInt(size_t len, tsType maxV, tsType minV=0)
Generates a vector of random integers.
Definition: MicroDataSet.hpp:148
vector< dType > genZipfLut(size_t len, dType fac)
To generate the zipf Lut.
Definition: MicroDataSet.hpp:170
vector< dType > genIncrementalAlphabet(size_t len)
Generates an incremental alphabet, starting from 1 and ending at len (0 is never part of the alphabet).
Definition: MicroDataSet.hpp:77
vector< tsType > genSmoothTimeStamp(size_t len, size_t step, size_t interval)
The function to generate a vector of timestamp which grows smoothly.
Definition: MicroDataSet.hpp:210
vector< tsType > genZipfTimeStamp(size_t len, tsType maxTime, double fac)
Generates a sorted vector of timestamps that follow a Zipf distribution.
Definition: MicroDataSet.hpp:231
Definition: DatasetTool.h:10