10 #include "../base/streamingalgorithm.h"
11 #include "../base/dissimilaritymeasure.h"
12 #include "../base/solutionprovider.h"
13 #include "../base/weightmodifier.h"
14 #include "../base/partitionprovider.h"
15 #include "../clustering/cfrentry.h"
16 #include "../datastructure/proxysolution.h"
17 #include "../evaluation/kmeansevaluator.h"
18 #include "../exception/invalidruntimeconfigurationexception.h"
19 #include "../misc/randomness.h"
92 typename FeatureList::iterator
begin()
101 typename FeatureList::iterator
end()
131 typename FeatureList::iterator
nearest(T
const & element,
int level)
133 typename FeatureList::iterator minIt =
features.end();
138 double val =
outer.project(element, 0);
139 int bucket_number =
outer.calcBucketNumber(0, val);
141 int bucket_min = bucket_number;
144 if ((bucket_number < 0) || (bucket_number >
outer.buckets[0].size() - 1))
152 mins =
outer.buckets[mini][bucket_min].size();
153 for (
int i = 1; i <
outer.L; i++)
155 val =
outer.project(element, i);
156 bucket_number =
outer.calcBucketNumber(i, val);
157 if ((bucket_number >= 0) & (bucket_number <=
outer.buckets[i].size() - 1))
159 int s =
outer.buckets[i][bucket_number].size();
163 bucket_min = bucket_number;
170 bucket_min = bucket_number;
178 bucket_number = bucket_min;
181 if (bucket_number < 0)
184 outer.allocateBucket(rnd,
true);
186 else if (bucket_number >
outer.buckets[rnd].size() - 1)
189 outer.allocateBucket(rnd,
false);
196 for (
auto it =
outer.buckets[rnd][bucket_number].begin(); it !=
outer.buckets[rnd][bucket_number].end(); ++it)
198 double tmpDist =
outer.measure->dissimilarity((*it)->first.representative, element);
199 if (tmpDist < minDist || minDist == -1)
214 double tmpDist =
outer.measure->dissimilarity(it->first.representative, element);
215 if (tmpDist < minDist || minDist == -1)
230 void erase(
typename FeatureList::iterator pos)
319 void print(std::ostream& os);
334 void insertIntoNN(
typename BicoNode::FeatureList::iterator iteratorElement);
362 double project(T point,
int i);
395 double getT(
int level);
402 double getR(
int level);
428 std::vector<std::vector<std::vector<typename BicoNode::FeatureList::iterator >> >
buckets;
432 std::vector<std::pair<double, double >>
borders;
446 std::unique_ptr<DissimilarityMeasure<T >>
measure;
516 template<
class T1,
class T2>
bool operator()(std::pair<T1, T2>
const & left, std::pair<T1, T2>
const & right)
518 return P<T1>()(left.first, right.first);
522 template<
typename T>
Bico<T>::Bico(
size_t dim,
size_t n,
size_t k,
size_t p,
size_t nMax,
525 measure(measure->clone()),
526 weightModifier(weightModifier->clone()),
537 minDist(std::numeric_limits<double>::infinity()),
538 pairwise_different(0),
547 std::normal_distribution<double> realDist(0.0, 1.0);
548 for (
int i = 0; i <
L; i++)
554 rndpoint[j] = realDist(rg);
555 norm += rndpoint[j] * rndpoint[j];
557 norm = std::sqrt(norm);
571 return (
int) floor((val - borders[rnd].first) / bucket_radius[rnd]);
576 double maxBuckets = 10000;
578 for (
int i = 0; i < L; i++)
581 if (buckets[i].size() == 1)
587 bucket_radius[i] = (
long long int) ceil(sqrt(getR(1)));
588 Size = (int) ceil((borders[i].second - borders[i].first) / (double) bucket_radius[i]);
589 if(Size < 0 || Size > maxBuckets)
591 bucket_radius[i] = (borders[i].second - borders[i].first) / maxBuckets;
592 Size = (int) ceil((borders[i].second - borders[i].first) / (double) bucket_radius[i]);
595 for (
int l = 0; l < buckets[i].size(); l++) buckets[i][l].clear();
598 buckets[i].resize((
int) ceil(Size));
607 borders[bucket].first = 2 * borders[bucket].first - borders[bucket].second;
608 std::vector < std::vector<typename BicoNode::FeatureList::iterator >> a(2 * buckets[bucket].size());
609 for (
int i = 0; i < buckets[bucket].size(); i++)
611 a[buckets[bucket].size() + i] = buckets[bucket][i];
613 for (
int l = 0; l < buckets[bucket].size(); l++) buckets[bucket][l].clear();
614 buckets[bucket].clear();
620 borders[bucket].second = 2 * borders[bucket].second - borders[bucket].first;
621 std::vector < std::vector<typename BicoNode::FeatureList::iterator >> a(2 * buckets[bucket].size());
622 for (
int i = 0; i < buckets[bucket].size(); i++)
624 a[i] = buckets[bucket][i];
626 for (
int l = 0; l < buckets[bucket].size(); l++) buckets[bucket][l].clear();
627 buckets[bucket].clear();
635 for (
int j = 0; j < dimension; j++)
637 ip += point[j]*(rndprojections[i][j]);
651 result->
proxysets.push_back(std::vector<T>());
652 result->
proxysets[0].reserve(curNumOfCFs);
653 computeTraverse(root, result);
660 for (
auto it = node->
begin(); it != node->
end(); ++it)
662 T point(it->first.cog());
663 weightModifier->setWeight(point, it->first.number);
665 computeTraverse(it->second, solution);
674 for (
int i = 0; i < L; i++)
676 double val = std::abs(project(element, i));
677 if (val > maxVal[i] || maxVal[i] == -1)
684 buffer.push_back(element);
685 ++pairwise_different;
688 double projected = project(element, 0);
689 projection_buffer.push_back(std::pair<double, T const *>(projected, &buffer.back()));
692 if (pairwise_different >= maxNumOfCFs + 1)
696 double minProjDist = std::numeric_limits<double>::infinity();
697 double minProjRealDist = std::numeric_limits<double>::infinity();
698 for(
int i = 0; i < pairwise_different-2; ++i)
700 double tmpDist = projection_buffer[i+1].first - projection_buffer[i].first;
701 if(tmpDist < minProjRealDist)
703 minProjDist = tmpDist;
704 double tmpMinProjRealDist = measure->dissimilarity(*projection_buffer[i].second, *projection_buffer[i+1].second);
705 if(tmpMinProjRealDist > 0)
707 minProjRealDist = tmpMinProjRealDist;
715 double lowerEnd = projection_buffer[0].first;
716 double upperEnd = lowerEnd + minProjRealDist;
717 double minDist = minProjRealDist;
718 for(
int i = 0; i < pairwise_different-1; ++i)
720 if(projection_buffer[i].first >= upperEnd)
724 for(
int j = lowerIndex; j < upperIndex; ++j)
726 for(
int k = j+1; k < upperIndex; ++k)
729 double tmpDist = measure->dissimilarity(*projection_buffer[j].second, *projection_buffer[k].second);
730 if(tmpDist < minDist && tmpDist > 0)
738 lowerEnd = projection_buffer[i].first;
739 upperEnd = lowerEnd + minProjRealDist;
744 optEst = 16.0 * minDist;
747 long long int radius = (
long long int) ceil(sqrt(getR(1)));
749 for (
int i = 0; i < L; i++)
751 borders[i].first = -maxVal[i];
752 borders[i].second = maxVal[i];
753 bucket_radius[i] = radius;
758 for (
auto it = buffer.begin(); it != buffer.end(); ++it)
759 insert(root, 1, *it);
765 insert(root, 1, element);
771 for (
int i = 0; i < L; i++)
773 double val = project(iteratorElement->first.representative, i);
774 int bucket_number = calcBucketNumber(i, val);
776 if (bucket_number < 0)
778 while (bucket_number < 0)
780 allocateBucket(i,
true);
781 bucket_number = calcBucketNumber(i, val);
784 else if (bucket_number > buckets[i].size() - 1)
786 while (bucket_number > buckets[i].size() - 1)
788 allocateBucket(i,
false);
789 bucket_number = calcBucketNumber(i, val);
792 buckets[i][bucket_number].push_back(iteratorElement);
803 typename BicoNode::FeatureList::iterator nearest(node->
nearest(element, level));
808 if (node->
empty() || nearest == node->
end()
809 || measure->dissimilarity(nearest->first.representative, element) > getR(level))
811 CFREntry<T> feature(1, element, element * element, element);
812 typename BicoNode::FeatureList::iterator itele = node->
insert(feature);
823 T center(nearest->first.cog());
827 testFeature.insert(element);
828 double tfCost = testFeature.kMeansCost(center);
832 if (tfCost < getT(level))
834 nearest->first.insert(element);
838 insert(nearest->second, level + 1, element);
843 while (curNumOfCFs > maxNumOfCFs)
854 rebuildFirstLevel(this->root, oldRoot);
859 rebuildTraversMerge(this->root, 1);
871 auto nextIt = child->
begin();
872 for (
auto it = child->
begin(); it != child->
end(); it = nextIt)
876 typename BicoNode::FeatureList::iterator nearest(parent->
nearest(it->first.representative, 1));
877 if (parent->
empty() || nearest == parent->
end()
878 || measure->dissimilarity(nearest->first.representative, it->first.representative) > getR(1))
888 testFeature += nearest->first;
889 if (testFeature.
kMeansCost(nearest->first.representative) <= getT(1))
892 nearest->first += it->first;
894 it->second->spliceAllTo(nearest->second, nearest->second->end());
913 for (
auto parentIt = node->
begin(); parentIt != node->
end(); ++parentIt)
915 if (!parentIt->second->empty())
917 auto nextIt = parentIt->second->begin();
918 for (
auto childIt = parentIt->second->begin(); childIt != parentIt->second->end(); childIt = nextIt)
922 T center(parentIt->first.cog());
925 CFEntry<T> testFeature(parentIt->first + childIt->first);
926 double tfCost = testFeature.kMeansCost(center);
929 if (tfCost < getT(level))
931 parentIt->first += childIt->first;
932 childIt->second->spliceAllTo(parentIt->second, parentIt->second->end());
933 childIt->second->clear();
934 delete childIt->second;
935 parentIt->second->erase(childIt);
940 rebuildTraversMerge(childIt->second, level + 1);
954 return getT(level) / (double(1 << (3 + level)));
959 os <<
"digraph G{\n";
960 os <<
"node [shape=record];\n";
969 os <<
"node" <<
id <<
"[label=\"";
971 os << node->
id() <<
"|";
972 for (
auto it = node->
begin(); it != node->
end(); ++it)
976 os <<
"<f" << fvalue <<
"> " << it->first.number <<
"," << it->first.representative
977 <<
"\\n" << it->first.LS <<
"," << it->first.SS;
983 for (
auto it = node->
begin(); it != node->
end(); ++it)
985 print(os, it->second);
986 os <<
"node" <<
id <<
":f" << fvalue <<
" -> node" << it->second->id() <<
";\n";
void computeTraverse(BicoNode *node, ProxySolution< T > *solution)
Recursive computation of the coreset.
void initializeNN()
Initialize nearest neighbour data structure.
Encapsulates an STL random generator.
double getR(int level)
Returns the radius for a given level.
size_t maxNumOfCFs
Maximum coreset size.
void clear()
Delete all nodes.
double getT(int level)
Returns the threshold for a given level.
std::vector< std::vector< T > > proxysets
double kMeansCost(T const ¢er)
1-means clustering cost
void spliceElementTo(typename FeatureList::iterator it, BicoNode *to, typename FeatureList::iterator pos)
std::unique_ptr< WeightModifier< T > > weightModifier
Buffer for WeightModifier.
void erase(typename FeatureList::iterator pos)
std::unique_ptr< DissimilarityMeasure< T > > measure
Buffer for DissimilarityMeasure.
size_t curNumOfCFs
Current coreset size.
int numOfRebuilds
Current number of rebuilding.
void insertIntoNN(typename BicoNode::FeatureList::iterator iteratorElement)
Inserts an element into the nearest neighbour data structure.
void spliceAllTo(BicoNode *to, typename FeatureList::iterator pos)
FeatureList::iterator insert(CFREntry< T > const &feature)
Fast computation of k-means coresets in a data stream.
FeatureList::iterator begin()
std::vector< T > buffer
Buffer phase's buffer.
void rebuild()
Rebuilds the tree.
size_t pairwise_different
Number of unique elements read in buffer phase.
bool operator()(std::pair< T1, T2 > const &left, std::pair< T1, T2 > const &right)
void print(std::ostream &os)
Write the tree as GraphViz source into a stream.
std::vector< std::vector< std::vector< typename BicoNode::FeatureList::iterator > > > buckets
Buckets for nearest neighbour search in first level.
std::vector< double > maxVal
Extreme values used for constructing the nearest neighbour buckets.
int calcBucketNumber(int rnd, double val)
void rebuildFirstLevel(BicoNode *parent, BicoNode *child)
Bico< T > & outer
Parent BICO instance.
BicoNode * root
Root node of BICO's tree.
int nodeIdCounter
Counter for unique BicoNode object ids.
std::vector< std::pair< double, T const * > > projection_buffer
Buffer phase's buffer for projected buffer points.
BicoNode(Bico< T > &outer)
double minDist
Minimum pair distance of two points read in buffer phase.
Abstract base class to modify the weight of weighted objects.
static RandomGenerator getRandomGenerator()
std::vector< double > bucket_radius
Width of buckets.
double project(T point, int i)
Projects a point onto a projection line.
Clustering feature with representation point.
void insert(BicoNode *node, int level, T const &element)
Inserts an element into a BicoNode at a certain level.
Class representing a node in BICO's tree.
size_t k
Number of centers.
std::pair< CFREntry< T >, BicoNode * > FeaturePair
Data structure for proxies.
int objectId
Unique object id.
FeatureList::iterator nearest(T const &element, int level)
virtual Bico< T > & operator<<(T const &element)
Read a point Insert the point into BICO's tree.
Abstract base class for streaming algorithms.
std::vector< std::pair< double, double > > borders
Bucket borders.
std::list< FeaturePair > FeatureList
double optEst
Current estimation of the optimal clustering cost.
Abstract base class for dissimilarity measurement.
size_t dimension
Dimension of the input points.
bool bufferPhase
Buffer phase indicator.
virtual ProxySolution< T > * compute()
Returns a coreset of all point read so far.
Clustering feature tree entry.
std::vector< std::vector< double > > rndprojections
Random projection vectors.
size_t L
Number of projections.
Indicates that a computation entered an invalid configuration state.
FeatureList::iterator end()
void allocateBucket(int bucket, bool left)
Allocates a new bucket.
Bico(size_t dimension, size_t n, size_t k, size_t p, size_t nMax, DissimilarityMeasure< T > *measure, WeightModifier< T > *weightModifier)
Constructs BICO for points of type T T can be an arbitrary type but it has to fulfil the requirements...
void rebuildTraversMerge(BicoNode *node, int level)