kmean算法C++实现

kmean均值算法是一种最常见的聚类算法。算法实现简单，效果也比较好。kmean算法把n个对象划分成指定的k个簇，每个簇中所有对象的均值的平均值为该簇的聚点（中心）。

k均值算法有如下五个步骤：

随机生成最初始k个簇心。可以从样本中随机选择，也可以根据样本中每个特征的取值特点随机生成。
对每个样本计算到每个簇心的欧式距离，将样本划分到欧氏距离最小的簇心（聚点）。
对划分到同一个簇心（聚点）的样本计算平均值，用均值更新簇心（聚点）
若某些簇心（聚点）发生变化，转到2；若所有的聚点都没有变化，转5
输出划分结果

  1 #include <vector>
  2 #include <cassert>
  3 #include <iostream>
  4 #include <cmath>
  5 #include <fstream>
  6 #include <climits>
  7 #include <ctime>
  8 #include <iomanip>
  9 
 10 using namespace std;
 11 namespace terse {
 12 class Kmeans {
 13 private:
 14     vector<vector<double>> m_dataSet;
 15     int m_k;
 16     vector<int> m_clusterResult;         // result of cluster
 17     vector<vector<double>> m_cluserCent; //center of k clusters
 18 
 19 private:
 20     vector<string> split(const string& s, string pattern) {
 21         vector<string> res;
 22         size_t start = 0;
 23         size_t end = 0;
 24         while (start < s.size()) {
 25             end = s.find_first_of(pattern, start);
 26             if (end == string::npos) {
 27                 res.push_back(s.substr(start, end - start - 1));
 28                 return res;
 29             }
 30             res.push_back(s.substr(start, end - start));
 31             start = end + 1;
 32         }
 33         return res;
 34     }
 35 
 36     void loadDataSet(const char* fileName) {
 37         ifstream dataFile(fileName);
 38         if (!dataFile.is_open()) {
 39             cerr << "open file " << fileName << "failed!\n";
 40             return;
 41         }
 42         string tmpstr;
 43         vector<double> data;
 44         while (!dataFile.eof()) {
 45             data.clear();
 46             tmpstr.clear();
 47             getline(dataFile, tmpstr);
 48             vector<string> tmp = split(tmpstr, ",");
 49             for (string str : tmp) {
 50                 data.push_back(stod(str));
 51             }
 52             this->m_dataSet.push_back(data);
 53         }
 54         dataFile.close();
 55     }
 56 
 57     //compute Euclidean distance of two vector
 58     double distEclud(vector<double>& v1, vector<double>& v2) {
 59         assert(v1.size() == v2.size());
 60         double dist = 0;
 61         for (size_t i = 0; i < v1.size(); i++) {
 62             dist += (v1[i] - v2[i]) * (v1[i] - v2[i]);
 63         }
 64         return sqrt(dist);
 65     }
 66 
 67     void generateRandCent() {
 68         int numOfFeats = this->m_dataSet[0].size();
 69         size_t numOfSamples = this->m_dataSet.size();
 70 
 71         //first:min second：max
 72         vector<pair<double, double>> minMaxOfFeat(numOfFeats);
 73         for (int i = 0; i < numOfFeats; i++) {
 74             minMaxOfFeat[i].first = this->m_dataSet[0][i];
 75             minMaxOfFeat[i].second = this->m_dataSet[0][i];
 76         }
 77         for (size_t i = 1; i < numOfSamples; i++) {
 78             for (int j = 0; j < numOfFeats; j++) {
 79                 if (this->m_dataSet[i][j] > minMaxOfFeat[j].second) {
 80                     minMaxOfFeat[j].second = this->m_dataSet[i][j];
 81                 }
 82                 if (this->m_dataSet[i][j] < minMaxOfFeat[j].first) {
 83                     minMaxOfFeat[j].first = this->m_dataSet[i][j];
 84                 }
 85             }
 86         }
 87         srand(time(NULL));
 88         for (int i = 0; i < this->m_k; i++) {
 89             for (int j = 0; j < numOfFeats; j++) {
 90                 this->m_cluserCent[i][j] = minMaxOfFeat[j].first
 91                         + (minMaxOfFeat[j].second - minMaxOfFeat[j].first)
 92                                 * (rand() / (double) RAND_MAX);
 93             }
 94         }
 95 
 96     }
 97 
 98     void printClusterCent(int iter) {
 99         int m = this->m_cluserCent.size();
100         int n = this->m_cluserCent[0].size();
101         cout << "iter =  " << iter;
102         for (int i = 0; i < m; i++) {
103             cout << " {";
104             for (int j = 0; j < n; j++) {
105                 cout << this->m_cluserCent[i][j] << ",";
106             }
107             cout << "};";
108         }
109         cout << endl;
110     }
111 
112     void writeResult(const char* fileName = "res.txt") {
113         ofstream fout(fileName);
114         if (!fout.is_open()) {
115             cerr << "open file " << fileName << "failed!";
116             return;
117         }
118         for (size_t i = 0; i < this->m_dataSet.size(); i++) {
119             for (size_t j = 0; j < this->m_dataSet[0].size(); j++) {
120                 fout << this->m_dataSet[i][j] << "\t";
121             }
122             fout << setprecision(5) << this->m_clusterResult[i] << "\n";
123         }
124         fout.close();
125     }
126 
127 public:
128     Kmeans(int k, const char* fileName) {
129         this->m_k = k;
130         this->loadDataSet(fileName);
131         this->m_clusterResult.reserve(this->m_dataSet.size());
132         this->m_cluserCent = vector<vector<double>>(k,
133                 vector<double>(this->m_dataSet[0].size()));
134         generateRandCent();
135     }
136 
137     Kmeans(int k, vector<vector<double>>& data) {
138         this->m_k = k;
139         this->m_dataSet = data;
140         this->m_clusterResult.reserve(this->m_dataSet.size());
141         this->m_cluserCent = vector<vector<double>>(k,
142                 vector<double>(this->m_dataSet[0].size()));
143         generateRandCent();
144     }
145 
146     //verbose = 1,printClusterCent();
147     void kmeansCluster(int verbose = 1) {
148         int iter = 0;
149         bool isClusterChanged = true;
150         while (isClusterChanged) {
151             isClusterChanged = false;
152             //step 1: find the nearest centroid of each point
153             int numOfFeats = this->m_dataSet[0].size();
154             size_t numOfSamples = this->m_dataSet.size();
155             for (size_t i = 0; i < numOfSamples; i++) {
156                 int minIndex = -1;
157                 double minDist = INT_MAX;
158                 for (int j = 0; j < this->m_k; j++) {
159                     double dist = distEclud(this->m_cluserCent[j],
160                             m_dataSet[i]);
161                     if (dist < minDist) {
162                         minDist = dist;
163                         minIndex = j;
164                     }
165                 }
166                 if (m_clusterResult[i] != minIndex) {
167                     isClusterChanged = true;
168                     m_clusterResult[i] = minIndex;
169                 }
170             }
171 
172             //step 2: update cluster center
173             vector<size_t> cnt(this->m_k, 0);
174             this->m_cluserCent = vector<vector<double>>(this->m_k,
175                     vector<double>(numOfFeats, 0.0));
176             for (size_t i = 0; i < numOfSamples; i++) {
177                 for (int j = 0; j < numOfFeats; j++) {
178                     this->m_cluserCent[this->m_clusterResult[i]][j] +=
179                             this->m_dataSet[i][j];
180                 }
181                 cnt[this->m_clusterResult[i]]++;
182             }
183             // mean of the vector belong to a cluster
184             for (int i = 0; i < this->m_k; i++) {
185                 for (int j = 0; j < numOfFeats; j++) {
186                     this->m_cluserCent[i][j] /= cnt[i];
187                 }
188             }
189             if (verbose)
190                 printClusterCent(iter++);
191         }
192         writeResult();
193     }
194 };
195 
196 };
197 
198 int main(){
199     terse::Kmeans kmeans(4,"datafile.txt");
200     kmeans.kmeansCluster();
201     return 0;
202 }
203 /*namespace terse*/

转载于:https://www.cnblogs.com/wxquare/p/6754485.html

CSS控制文字超出部分省略号显示

一：单行文本显示不完时，我们只需要设置单行文本的宽度，不允许换行，溢出隐藏及换行省略四个属性即可： p{ max-width: 100px; overflow:hidden;

文章马伊琍共同现身

就当文章出轨事件逐渐淡出人们视野时，昨日有网友拍到了文章马伊琍夫妇合体现身某商场带女儿滑冰的照片。这是文章外遇门后首度和妻子马伊琍共同露面，不过两人似乎感觉貌合神离，有些生疏。而巧合的是就在同一天网友也拍到了姚笛在深圳拍开车戏的镜头。不少看客纷纷吐槽称三人各演各的，开始按剧情发展了。

PostgreSQL 12 版本

PostgreSQL全球开发组今天宣布，世界上功能最为强大的开源数据库发布PostgreSQL 12版本发布。 PostgreSQL 12版在各方面都得到了加强，包括显著地提升查询性能，特别是对大数据集，总的空间利用率方面。这个版本为应用程序开发人员

[数据仓库建模] 事实表的类型（可加，半可加，不可加）

众所周知，数据仓库的事实表中是有数字度量的，一般会根据这些数据度量以及提前规定好的一致性维度来进行统计等工作。那么事实表中的数组度量分三种：1.可加事实，2.半可加事实，3.不可加事实下面让我们来看看它们的定义与区别 1.可加事实

开源 vlc media player_Qt开源作品2-视频流播放vlc内核

## 一、前言上一个作品是ffmpeg内核做的，由于ffmpeg太过于强大，很多初学者会看的云里雾里懵逼状态，也有很多用户只需要一个简单的播放视频流即可，根本不需要涉及到负责的解码转码等，于是vlc就上场了，