first commit
This commit is contained in:
13
third_party/bhtsne/README.md
vendored
Executable file
13
third_party/bhtsne/README.md
vendored
Executable file
@@ -0,0 +1,13 @@
|
||||
implement of t-SNE algorithm, reduce high dims of feature to 2D/3D used for display on screen later.
|
||||
|
||||
header-only style please refer to https://github.com/lvdmaaten/bhtsne
|
||||
|
||||
|
||||
```
|
||||
compile test:
|
||||
g++ tsne_test.cpp -o bh_tsne_test -O2
|
||||
|
||||
and run:
|
||||
./bh_tsne_test
|
||||
|
||||
```
|
||||
545
third_party/bhtsne/sptree.h
vendored
Executable file
545
third_party/bhtsne/sptree.h
vendored
Executable file
@@ -0,0 +1,545 @@
|
||||
/*
|
||||
*
|
||||
* Copyright (c) 2014, Laurens van der Maaten (Delft University of Technology)
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* 3. All advertising materials mentioning features or use of this software
|
||||
* must display the following acknowledgement:
|
||||
* This product includes software developed by the Delft University of Technology.
|
||||
* 4. Neither the name of the Delft University of Technology nor the names of
|
||||
* its contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY LAURENS VAN DER MAATEN ''AS IS'' AND ANY EXPRESS
|
||||
* OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
|
||||
* EVENT SHALL LAURENS VAN DER MAATEN BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
|
||||
* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
|
||||
* IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
|
||||
* OF SUCH DAMAGE.
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
#ifndef SPTREE_H
|
||||
#define SPTREE_H
|
||||
|
||||
using namespace std;
|
||||
|
||||
|
||||
class Cell {
|
||||
|
||||
unsigned int dimension;
|
||||
double* corner;
|
||||
double* width;
|
||||
|
||||
|
||||
public:
|
||||
Cell(unsigned int inp_dimension);
|
||||
Cell(unsigned int inp_dimension, double* inp_corner, double* inp_width);
|
||||
~Cell();
|
||||
|
||||
double getCorner(unsigned int d);
|
||||
double getWidth(unsigned int d);
|
||||
void setCorner(unsigned int d, double val);
|
||||
void setWidth(unsigned int d, double val);
|
||||
bool containsPoint(double point[]);
|
||||
};
|
||||
|
||||
|
||||
class SPTree
|
||||
{
|
||||
|
||||
// Fixed constants
|
||||
static const unsigned int QT_NODE_CAPACITY = 1;
|
||||
|
||||
// A buffer we use when doing force computations
|
||||
double* buff;
|
||||
|
||||
// Properties of this node in the tree
|
||||
SPTree* parent;
|
||||
unsigned int dimension;
|
||||
bool is_leaf;
|
||||
unsigned int size;
|
||||
unsigned int cum_size;
|
||||
|
||||
// Axis-aligned bounding box stored as a center with half-dimensions to represent the boundaries of this quad tree
|
||||
Cell* boundary;
|
||||
|
||||
// Indices in this space-partitioning tree node, corresponding center-of-mass, and list of all children
|
||||
double* data;
|
||||
double* center_of_mass;
|
||||
unsigned int index[QT_NODE_CAPACITY];
|
||||
|
||||
// Children
|
||||
SPTree** children;
|
||||
unsigned int no_children;
|
||||
|
||||
public:
|
||||
SPTree(unsigned int D, double* inp_data, unsigned int N);
|
||||
SPTree(unsigned int D, double* inp_data, double* inp_corner, double* inp_width);
|
||||
SPTree(unsigned int D, double* inp_data, unsigned int N, double* inp_corner, double* inp_width);
|
||||
SPTree(SPTree* inp_parent, unsigned int D, double* inp_data, unsigned int N, double* inp_corner, double* inp_width);
|
||||
SPTree(SPTree* inp_parent, unsigned int D, double* inp_data, double* inp_corner, double* inp_width);
|
||||
~SPTree();
|
||||
void setData(double* inp_data);
|
||||
SPTree* getParent();
|
||||
void construct(Cell boundary);
|
||||
bool insert(unsigned int new_index);
|
||||
void subdivide();
|
||||
bool isCorrect();
|
||||
void rebuildTree();
|
||||
void getAllIndices(unsigned int* indices);
|
||||
unsigned int getDepth();
|
||||
void computeNonEdgeForces(unsigned int point_index, double theta, double neg_f[], double* sum_Q);
|
||||
void computeEdgeForces(unsigned int* row_P, unsigned int* col_P, double* val_P, int N, double* pos_f);
|
||||
void print();
|
||||
|
||||
private:
|
||||
void init(SPTree* inp_parent, unsigned int D, double* inp_data, double* inp_corner, double* inp_width);
|
||||
void fill(unsigned int N);
|
||||
unsigned int getAllIndices(unsigned int* indices, unsigned int loc);
|
||||
bool isChild(unsigned int test_index, unsigned int start, unsigned int end);
|
||||
};
|
||||
|
||||
|
||||
/* copy from original sptree.cpp file */
|
||||
/*
|
||||
*
|
||||
* Copyright (c) 2014, Laurens van der Maaten (Delft University of Technology)
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* 3. All advertising materials mentioning features or use of this software
|
||||
* must display the following acknowledgement:
|
||||
* This product includes software developed by the Delft University of Technology.
|
||||
* 4. Neither the name of the Delft University of Technology nor the names of
|
||||
* its contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY LAURENS VAN DER MAATEN ''AS IS'' AND ANY EXPRESS
|
||||
* OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
|
||||
* EVENT SHALL LAURENS VAN DER MAATEN BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
|
||||
* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
|
||||
* IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
|
||||
* OF SUCH DAMAGE.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <math.h>
|
||||
#include <float.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <cmath>
|
||||
#include "sptree.h"
|
||||
|
||||
|
||||
|
||||
// Constructs cell
|
||||
Cell::Cell(unsigned int inp_dimension) {
|
||||
dimension = inp_dimension;
|
||||
corner = (double*) malloc(dimension * sizeof(double));
|
||||
width = (double*) malloc(dimension * sizeof(double));
|
||||
}
|
||||
|
||||
Cell::Cell(unsigned int inp_dimension, double* inp_corner, double* inp_width) {
|
||||
dimension = inp_dimension;
|
||||
corner = (double*) malloc(dimension * sizeof(double));
|
||||
width = (double*) malloc(dimension * sizeof(double));
|
||||
for(int d = 0; d < dimension; d++) setCorner(d, inp_corner[d]);
|
||||
for(int d = 0; d < dimension; d++) setWidth( d, inp_width[d]);
|
||||
}
|
||||
|
||||
// Destructs cell
|
||||
Cell::~Cell() {
|
||||
free(corner);
|
||||
free(width);
|
||||
}
|
||||
|
||||
double Cell::getCorner(unsigned int d) {
|
||||
return corner[d];
|
||||
}
|
||||
|
||||
double Cell::getWidth(unsigned int d) {
|
||||
return width[d];
|
||||
}
|
||||
|
||||
void Cell::setCorner(unsigned int d, double val) {
|
||||
corner[d] = val;
|
||||
}
|
||||
|
||||
void Cell::setWidth(unsigned int d, double val) {
|
||||
width[d] = val;
|
||||
}
|
||||
|
||||
// Checks whether a point lies in a cell
|
||||
bool Cell::containsPoint(double point[])
|
||||
{
|
||||
for(int d = 0; d < dimension; d++) {
|
||||
if(corner[d] - width[d] > point[d]) return false;
|
||||
if(corner[d] + width[d] < point[d]) return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
// Default constructor for SPTree -- build tree, too!
|
||||
SPTree::SPTree(unsigned int D, double* inp_data, unsigned int N)
|
||||
{
|
||||
|
||||
// Compute mean, width, and height of current map (boundaries of SPTree)
|
||||
int nD = 0;
|
||||
double* mean_Y = (double*) calloc(D, sizeof(double));
|
||||
double* min_Y = (double*) malloc(D * sizeof(double)); for(unsigned int d = 0; d < D; d++) min_Y[d] = DBL_MAX;
|
||||
double* max_Y = (double*) malloc(D * sizeof(double)); for(unsigned int d = 0; d < D; d++) max_Y[d] = -DBL_MAX;
|
||||
for(unsigned int n = 0; n < N; n++) {
|
||||
for(unsigned int d = 0; d < D; d++) {
|
||||
mean_Y[d] += inp_data[n * D + d];
|
||||
if(inp_data[nD + d] < min_Y[d]) min_Y[d] = inp_data[nD + d];
|
||||
if(inp_data[nD + d] > max_Y[d]) max_Y[d] = inp_data[nD + d];
|
||||
}
|
||||
nD += D;
|
||||
}
|
||||
for(int d = 0; d < D; d++) mean_Y[d] /= (double) N;
|
||||
|
||||
// Construct SPTree
|
||||
double* width = (double*) malloc(D * sizeof(double));
|
||||
for(int d = 0; d < D; d++) width[d] = fmax(max_Y[d] - mean_Y[d], mean_Y[d] - min_Y[d]) + 1e-5;
|
||||
init(NULL, D, inp_data, mean_Y, width);
|
||||
fill(N);
|
||||
|
||||
// Clean up memory
|
||||
free(mean_Y);
|
||||
free(max_Y);
|
||||
free(min_Y);
|
||||
free(width);
|
||||
}
|
||||
|
||||
|
||||
// Constructor for SPTree with particular size and parent -- build the tree, too!
|
||||
SPTree::SPTree(unsigned int D, double* inp_data, unsigned int N, double* inp_corner, double* inp_width)
|
||||
{
|
||||
init(NULL, D, inp_data, inp_corner, inp_width);
|
||||
fill(N);
|
||||
}
|
||||
|
||||
|
||||
// Constructor for SPTree with particular size (do not fill the tree)
|
||||
SPTree::SPTree(unsigned int D, double* inp_data, double* inp_corner, double* inp_width)
|
||||
{
|
||||
init(NULL, D, inp_data, inp_corner, inp_width);
|
||||
}
|
||||
|
||||
|
||||
// Constructor for SPTree with particular size and parent (do not fill tree)
|
||||
SPTree::SPTree(SPTree* inp_parent, unsigned int D, double* inp_data, double* inp_corner, double* inp_width) {
|
||||
init(inp_parent, D, inp_data, inp_corner, inp_width);
|
||||
}
|
||||
|
||||
|
||||
// Constructor for SPTree with particular size and parent -- build the tree, too!
|
||||
SPTree::SPTree(SPTree* inp_parent, unsigned int D, double* inp_data, unsigned int N, double* inp_corner, double* inp_width)
|
||||
{
|
||||
init(inp_parent, D, inp_data, inp_corner, inp_width);
|
||||
fill(N);
|
||||
}
|
||||
|
||||
|
||||
// Main initialization function
|
||||
void SPTree::init(SPTree* inp_parent, unsigned int D, double* inp_data, double* inp_corner, double* inp_width)
|
||||
{
|
||||
parent = inp_parent;
|
||||
dimension = D;
|
||||
no_children = 2;
|
||||
for(unsigned int d = 1; d < D; d++) no_children *= 2;
|
||||
data = inp_data;
|
||||
is_leaf = true;
|
||||
size = 0;
|
||||
cum_size = 0;
|
||||
|
||||
boundary = new Cell(dimension);
|
||||
for(unsigned int d = 0; d < D; d++) boundary->setCorner(d, inp_corner[d]);
|
||||
for(unsigned int d = 0; d < D; d++) boundary->setWidth( d, inp_width[d]);
|
||||
|
||||
children = (SPTree**) malloc(no_children * sizeof(SPTree*));
|
||||
for(unsigned int i = 0; i < no_children; i++) children[i] = NULL;
|
||||
|
||||
center_of_mass = (double*) malloc(D * sizeof(double));
|
||||
for(unsigned int d = 0; d < D; d++) center_of_mass[d] = .0;
|
||||
|
||||
buff = (double*) malloc(D * sizeof(double));
|
||||
}
|
||||
|
||||
|
||||
// Destructor for SPTree
|
||||
SPTree::~SPTree()
|
||||
{
|
||||
for(unsigned int i = 0; i < no_children; i++) {
|
||||
if(children[i] != NULL) delete children[i];
|
||||
}
|
||||
free(children);
|
||||
free(center_of_mass);
|
||||
free(buff);
|
||||
delete boundary;
|
||||
}
|
||||
|
||||
|
||||
// Update the data underlying this tree
|
||||
void SPTree::setData(double* inp_data)
|
||||
{
|
||||
data = inp_data;
|
||||
}
|
||||
|
||||
|
||||
// Get the parent of the current tree
|
||||
SPTree* SPTree::getParent()
|
||||
{
|
||||
return parent;
|
||||
}
|
||||
|
||||
|
||||
// Insert a point into the SPTree
|
||||
bool SPTree::insert(unsigned int new_index)
|
||||
{
|
||||
// Ignore objects which do not belong in this quad tree
|
||||
double* point = data + new_index * dimension;
|
||||
if(!boundary->containsPoint(point))
|
||||
return false;
|
||||
|
||||
// Online update of cumulative size and center-of-mass
|
||||
cum_size++;
|
||||
double mult1 = (double) (cum_size - 1) / (double) cum_size;
|
||||
double mult2 = 1.0 / (double) cum_size;
|
||||
for(unsigned int d = 0; d < dimension; d++) center_of_mass[d] *= mult1;
|
||||
for(unsigned int d = 0; d < dimension; d++) center_of_mass[d] += mult2 * point[d];
|
||||
|
||||
// If there is space in this quad tree and it is a leaf, add the object here
|
||||
if(is_leaf && size < QT_NODE_CAPACITY) {
|
||||
index[size] = new_index;
|
||||
size++;
|
||||
return true;
|
||||
}
|
||||
|
||||
// Don't add duplicates for now (this is not very nice)
|
||||
bool any_duplicate = false;
|
||||
for(unsigned int n = 0; n < size; n++) {
|
||||
bool duplicate = true;
|
||||
for(unsigned int d = 0; d < dimension; d++) {
|
||||
if(point[d] != data[index[n] * dimension + d]) { duplicate = false; break; }
|
||||
}
|
||||
any_duplicate = any_duplicate | duplicate;
|
||||
}
|
||||
if(any_duplicate) return true;
|
||||
|
||||
// Otherwise, we need to subdivide the current cell
|
||||
if(is_leaf) subdivide();
|
||||
|
||||
// Find out where the point can be inserted
|
||||
for(unsigned int i = 0; i < no_children; i++) {
|
||||
if(children[i]->insert(new_index)) return true;
|
||||
}
|
||||
|
||||
// Otherwise, the point cannot be inserted (this should never happen)
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
// Create four children which fully divide this cell into four quads of equal area
|
||||
void SPTree::subdivide() {
|
||||
|
||||
// Create new children
|
||||
double* new_corner = (double*) malloc(dimension * sizeof(double));
|
||||
double* new_width = (double*) malloc(dimension * sizeof(double));
|
||||
for(unsigned int i = 0; i < no_children; i++) {
|
||||
unsigned int div = 1;
|
||||
for(unsigned int d = 0; d < dimension; d++) {
|
||||
new_width[d] = .5 * boundary->getWidth(d);
|
||||
if((i / div) % 2 == 1) new_corner[d] = boundary->getCorner(d) - .5 * boundary->getWidth(d);
|
||||
else new_corner[d] = boundary->getCorner(d) + .5 * boundary->getWidth(d);
|
||||
div *= 2;
|
||||
}
|
||||
children[i] = new SPTree(this, dimension, data, new_corner, new_width);
|
||||
}
|
||||
free(new_corner);
|
||||
free(new_width);
|
||||
|
||||
// Move existing points to correct children
|
||||
for(unsigned int i = 0; i < size; i++) {
|
||||
bool success = false;
|
||||
for(unsigned int j = 0; j < no_children; j++) {
|
||||
if(!success) success = children[j]->insert(index[i]);
|
||||
}
|
||||
index[i] = -1;
|
||||
}
|
||||
|
||||
// Empty parent node
|
||||
size = 0;
|
||||
is_leaf = false;
|
||||
}
|
||||
|
||||
|
||||
// Build SPTree on dataset
|
||||
void SPTree::fill(unsigned int N)
|
||||
{
|
||||
for(unsigned int i = 0; i < N; i++) insert(i);
|
||||
}
|
||||
|
||||
|
||||
// Checks whether the specified tree is correct
|
||||
bool SPTree::isCorrect()
|
||||
{
|
||||
for(unsigned int n = 0; n < size; n++) {
|
||||
double* point = data + index[n] * dimension;
|
||||
if(!boundary->containsPoint(point)) return false;
|
||||
}
|
||||
if(!is_leaf) {
|
||||
bool correct = true;
|
||||
for(int i = 0; i < no_children; i++) correct = correct && children[i]->isCorrect();
|
||||
return correct;
|
||||
}
|
||||
else return true;
|
||||
}
|
||||
|
||||
|
||||
|
||||
// Build a list of all indices in SPTree
|
||||
void SPTree::getAllIndices(unsigned int* indices)
|
||||
{
|
||||
getAllIndices(indices, 0);
|
||||
}
|
||||
|
||||
|
||||
// Build a list of all indices in SPTree
|
||||
unsigned int SPTree::getAllIndices(unsigned int* indices, unsigned int loc)
|
||||
{
|
||||
|
||||
// Gather indices in current quadrant
|
||||
for(unsigned int i = 0; i < size; i++) indices[loc + i] = index[i];
|
||||
loc += size;
|
||||
|
||||
// Gather indices in children
|
||||
if(!is_leaf) {
|
||||
for(int i = 0; i < no_children; i++) loc = children[i]->getAllIndices(indices, loc);
|
||||
}
|
||||
return loc;
|
||||
}
|
||||
|
||||
|
||||
unsigned int SPTree::getDepth() {
|
||||
if(is_leaf) return 1;
|
||||
int depth = 0;
|
||||
for(unsigned int i = 0; i < no_children; i++) depth = fmax(depth, children[i]->getDepth());
|
||||
return 1 + depth;
|
||||
}
|
||||
|
||||
|
||||
// Compute non-edge forces using Barnes-Hut algorithm
|
||||
void SPTree::computeNonEdgeForces(unsigned int point_index, double theta, double neg_f[], double* sum_Q)
|
||||
{
|
||||
|
||||
// Make sure that we spend no time on empty nodes or self-interactions
|
||||
if(cum_size == 0 || (is_leaf && size == 1 && index[0] == point_index)) return;
|
||||
|
||||
// Compute distance between point and center-of-mass
|
||||
double D = .0;
|
||||
unsigned int ind = point_index * dimension;
|
||||
for(unsigned int d = 0; d < dimension; d++) buff[d] = data[ind + d] - center_of_mass[d];
|
||||
for(unsigned int d = 0; d < dimension; d++) D += buff[d] * buff[d];
|
||||
|
||||
// Check whether we can use this node as a "summary"
|
||||
double max_width = 0.0;
|
||||
double cur_width;
|
||||
for(unsigned int d = 0; d < dimension; d++) {
|
||||
cur_width = boundary->getWidth(d);
|
||||
max_width = (max_width > cur_width) ? max_width : cur_width;
|
||||
}
|
||||
if(is_leaf || max_width / sqrt(D) < theta) {
|
||||
|
||||
// Compute and add t-SNE force between point and current node
|
||||
D = 1.0 / (1.0 + D);
|
||||
double mult = cum_size * D;
|
||||
*sum_Q += mult;
|
||||
mult *= D;
|
||||
for(unsigned int d = 0; d < dimension; d++) neg_f[d] += mult * buff[d];
|
||||
}
|
||||
else {
|
||||
|
||||
// Recursively apply Barnes-Hut to children
|
||||
for(unsigned int i = 0; i < no_children; i++) children[i]->computeNonEdgeForces(point_index, theta, neg_f, sum_Q);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Computes edge forces
|
||||
void SPTree::computeEdgeForces(unsigned int* row_P, unsigned int* col_P, double* val_P, int N, double* pos_f)
|
||||
{
|
||||
|
||||
// Loop over all edges in the graph
|
||||
unsigned int ind1 = 0;
|
||||
unsigned int ind2 = 0;
|
||||
double D;
|
||||
for(unsigned int n = 0; n < N; n++) {
|
||||
for(unsigned int i = row_P[n]; i < row_P[n + 1]; i++) {
|
||||
|
||||
// Compute pairwise distance and Q-value
|
||||
D = 1.0;
|
||||
ind2 = col_P[i] * dimension;
|
||||
for(unsigned int d = 0; d < dimension; d++) buff[d] = data[ind1 + d] - data[ind2 + d];
|
||||
for(unsigned int d = 0; d < dimension; d++) D += buff[d] * buff[d];
|
||||
D = val_P[i] / D;
|
||||
|
||||
// Sum positive force
|
||||
for(unsigned int d = 0; d < dimension; d++) pos_f[ind1 + d] += D * buff[d];
|
||||
}
|
||||
ind1 += dimension;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Print out tree
|
||||
void SPTree::print()
|
||||
{
|
||||
if(cum_size == 0) {
|
||||
printf("Empty node\n");
|
||||
return;
|
||||
}
|
||||
|
||||
if(is_leaf) {
|
||||
printf("Leaf node; data = [");
|
||||
for(int i = 0; i < size; i++) {
|
||||
double* point = data + index[i] * dimension;
|
||||
for(int d = 0; d < dimension; d++) printf("%f, ", point[d]);
|
||||
printf(" (index = %d)", index[i]);
|
||||
if(i < size - 1) printf("\n");
|
||||
else printf("]\n");
|
||||
}
|
||||
}
|
||||
else {
|
||||
printf("Intersection node with center-of-mass = [");
|
||||
for(int d = 0; d < dimension; d++) printf("%f, ", center_of_mass[d]);
|
||||
printf("]; children are:\n");
|
||||
for(int i = 0; i < no_children; i++) children[i]->print();
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
774
third_party/bhtsne/tsne.h
vendored
Executable file
774
third_party/bhtsne/tsne.h
vendored
Executable file
@@ -0,0 +1,774 @@
|
||||
/*
|
||||
*
|
||||
* Copyright (c) 2014, Laurens van der Maaten (Delft University of Technology)
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* 3. All advertising materials mentioning features or use of this software
|
||||
* must display the following acknowledgement:
|
||||
* This product includes software developed by the Delft University of Technology.
|
||||
* 4. Neither the name of the Delft University of Technology nor the names of
|
||||
* its contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY LAURENS VAN DER MAATEN ''AS IS'' AND ANY EXPRESS
|
||||
* OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
|
||||
* EVENT SHALL LAURENS VAN DER MAATEN BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
|
||||
* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
|
||||
* IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
|
||||
* OF SUCH DAMAGE.
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
#ifndef TSNE_H
|
||||
#define TSNE_H
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
namespace TSNE {
|
||||
#endif
|
||||
void run(double* X, int N, int D, double* Y, int no_dims, double perplexity, double theta, int rand_seed,
|
||||
bool skip_random_init, int max_iter, int stop_lying_iter, int mom_switch_iter);
|
||||
bool load_data(double** data, int* n, int* d, int* no_dims, double* theta, double* perplexity, int* rand_seed, int* max_iter);
|
||||
void save_data(double* data, int* landmarks, double* costs, int n, int d);
|
||||
#ifdef __cplusplus
|
||||
};
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
/* copy from original tsne.cpp file */
|
||||
/*
|
||||
*
|
||||
* Copyright (c) 2014, Laurens van der Maaten (Delft University of Technology)
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* 3. All advertising materials mentioning features or use of this software
|
||||
* must display the following acknowledgement:
|
||||
* This product includes software developed by the Delft University of Technology.
|
||||
* 4. Neither the name of the Delft University of Technology nor the names of
|
||||
* its contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY LAURENS VAN DER MAATEN ''AS IS'' AND ANY EXPRESS
|
||||
* OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
|
||||
* EVENT SHALL LAURENS VAN DER MAATEN BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
|
||||
* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
|
||||
* IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
|
||||
* OF SUCH DAMAGE.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <cfloat>
|
||||
#include <cmath>
|
||||
#include <cstdlib>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <ctime>
|
||||
#include "vptree.h"
|
||||
#include "sptree.h"
|
||||
|
||||
|
||||
using namespace std;
|
||||
|
||||
static double sign(double x) { return (x == .0 ? .0 : (x < .0 ? -1.0 : 1.0)); }
|
||||
|
||||
static void zeroMean(double* X, int N, int D);
|
||||
static void computeGaussianPerplexity(double* X, int N, int D, double* P, double perplexity);
|
||||
static void computeGaussianPerplexity(double* X, int N, int D, unsigned int** _row_P, unsigned int** _col_P, double** _val_P, double perplexity, int K);
|
||||
static double randn();
|
||||
static void computeExactGradient(double* P, double* Y, int N, int D, double* dC);
|
||||
static void computeGradient(unsigned int* inp_row_P, unsigned int* inp_col_P, double* inp_val_P, double* Y, int N, int D, double* dC, double theta);
|
||||
static double evaluateError(double* P, double* Y, int N, int D);
|
||||
static double evaluateError(unsigned int* row_P, unsigned int* col_P, double* val_P, double* Y, int N, int D, double theta);
|
||||
static void computeSquaredEuclideanDistance(double* X, int N, int D, double* DD);
|
||||
static void symmetrizeMatrix(unsigned int** row_P, unsigned int** col_P, double** val_P, int N);
|
||||
|
||||
// Perform t-SNE
|
||||
void TSNE::run(double* X, int N, int D, double* Y, int no_dims, double perplexity, double theta, int rand_seed,
|
||||
bool skip_random_init, int max_iter, int stop_lying_iter, int mom_switch_iter) {
|
||||
|
||||
// Set random seed
|
||||
if (skip_random_init != true) {
|
||||
if(rand_seed >= 0) {
|
||||
printf("Using random seed: %d\n", rand_seed);
|
||||
srand((unsigned int) rand_seed);
|
||||
} else {
|
||||
printf("Using current time as random seed...\n");
|
||||
srand(time(NULL));
|
||||
}
|
||||
}
|
||||
|
||||
// Determine whether we are using an exact algorithm
|
||||
if(N - 1 < 3 * perplexity) { printf("Perplexity too large for the number of data points!\n"); exit(1); }
|
||||
printf("Using no_dims = %d, perplexity = %f, and theta = %f\n", no_dims, perplexity, theta);
|
||||
bool exact = (theta == .0) ? true : false;
|
||||
|
||||
// Set learning parameters
|
||||
float total_time = .0;
|
||||
clock_t start, end;
|
||||
double momentum = .5, final_momentum = .8;
|
||||
double eta = 200.0;
|
||||
|
||||
// Allocate some memory
|
||||
double* dY = (double*) malloc(N * no_dims * sizeof(double));
|
||||
double* uY = (double*) malloc(N * no_dims * sizeof(double));
|
||||
double* gains = (double*) malloc(N * no_dims * sizeof(double));
|
||||
if(dY == NULL || uY == NULL || gains == NULL) { printf("Memory allocation failed!\n"); exit(1); }
|
||||
for(int i = 0; i < N * no_dims; i++) uY[i] = .0;
|
||||
for(int i = 0; i < N * no_dims; i++) gains[i] = 1.0;
|
||||
|
||||
// Normalize input data (to prevent numerical problems)
|
||||
printf("Computing input similarities...\n");
|
||||
start = clock();
|
||||
zeroMean(X, N, D);
|
||||
double max_X = .0;
|
||||
for(int i = 0; i < N * D; i++) {
|
||||
if(fabs(X[i]) > max_X) max_X = fabs(X[i]);
|
||||
}
|
||||
for(int i = 0; i < N * D; i++) X[i] /= max_X;
|
||||
|
||||
// Compute input similarities for exact t-SNE
|
||||
double* P; unsigned int* row_P; unsigned int* col_P; double* val_P;
|
||||
if(exact) {
|
||||
|
||||
// Compute similarities
|
||||
printf("Exact?");
|
||||
P = (double*) malloc(N * N * sizeof(double));
|
||||
if(P == NULL) { printf("Memory allocation failed!\n"); exit(1); }
|
||||
computeGaussianPerplexity(X, N, D, P, perplexity);
|
||||
|
||||
// Symmetrize input similarities
|
||||
printf("Symmetrizing...\n");
|
||||
int nN = 0;
|
||||
for(int n = 0; n < N; n++) {
|
||||
int mN = (n + 1) * N;
|
||||
for(int m = n + 1; m < N; m++) {
|
||||
P[nN + m] += P[mN + n];
|
||||
P[mN + n] = P[nN + m];
|
||||
mN += N;
|
||||
}
|
||||
nN += N;
|
||||
}
|
||||
double sum_P = .0;
|
||||
for(int i = 0; i < N * N; i++) sum_P += P[i];
|
||||
for(int i = 0; i < N * N; i++) P[i] /= sum_P;
|
||||
}
|
||||
|
||||
// Compute input similarities for approximate t-SNE
|
||||
else {
|
||||
|
||||
// Compute asymmetric pairwise input similarities
|
||||
computeGaussianPerplexity(X, N, D, &row_P, &col_P, &val_P, perplexity, (int) (3 * perplexity));
|
||||
|
||||
// Symmetrize input similarities
|
||||
symmetrizeMatrix(&row_P, &col_P, &val_P, N);
|
||||
double sum_P = .0;
|
||||
for(int i = 0; i < row_P[N]; i++) sum_P += val_P[i];
|
||||
for(int i = 0; i < row_P[N]; i++) val_P[i] /= sum_P;
|
||||
}
|
||||
end = clock();
|
||||
|
||||
// Lie about the P-values
|
||||
if(exact) { for(int i = 0; i < N * N; i++) P[i] *= 12.0; }
|
||||
else { for(int i = 0; i < row_P[N]; i++) val_P[i] *= 12.0; }
|
||||
|
||||
// Initialize solution (randomly)
|
||||
if (skip_random_init != true) {
|
||||
for(int i = 0; i < N * no_dims; i++) Y[i] = randn() * .0001;
|
||||
}
|
||||
|
||||
// Perform main training loop
|
||||
if(exact) printf("Input similarities computed in %4.2f seconds!\nLearning embedding...\n", (float) (end - start) / CLOCKS_PER_SEC);
|
||||
else printf("Input similarities computed in %4.2f seconds (sparsity = %f)!\nLearning embedding...\n", (float) (end - start) / CLOCKS_PER_SEC, (double) row_P[N] / ((double) N * (double) N));
|
||||
start = clock();
|
||||
|
||||
for(int iter = 0; iter < max_iter; iter++) {
|
||||
|
||||
// Compute (approximate) gradient
|
||||
if(exact) computeExactGradient(P, Y, N, no_dims, dY);
|
||||
else computeGradient(row_P, col_P, val_P, Y, N, no_dims, dY, theta);
|
||||
|
||||
// Update gains
|
||||
for(int i = 0; i < N * no_dims; i++) gains[i] = (sign(dY[i]) != sign(uY[i])) ? (gains[i] + .2) : (gains[i] * .8);
|
||||
for(int i = 0; i < N * no_dims; i++) if(gains[i] < .01) gains[i] = .01;
|
||||
|
||||
// Perform gradient update (with momentum and gains)
|
||||
for(int i = 0; i < N * no_dims; i++) uY[i] = momentum * uY[i] - eta * gains[i] * dY[i];
|
||||
for(int i = 0; i < N * no_dims; i++) Y[i] = Y[i] + uY[i];
|
||||
|
||||
// Make solution zero-mean
|
||||
zeroMean(Y, N, no_dims);
|
||||
|
||||
// Stop lying about the P-values after a while, and switch momentum
|
||||
if(iter == stop_lying_iter) {
|
||||
if(exact) { for(int i = 0; i < N * N; i++) P[i] /= 12.0; }
|
||||
else { for(int i = 0; i < row_P[N]; i++) val_P[i] /= 12.0; }
|
||||
}
|
||||
if(iter == mom_switch_iter) momentum = final_momentum;
|
||||
|
||||
// Print out progress
|
||||
if (iter > 0 && (iter % 50 == 0 || iter == max_iter - 1)) {
|
||||
end = clock();
|
||||
double C = .0;
|
||||
if(exact) C = evaluateError(P, Y, N, no_dims);
|
||||
else C = evaluateError(row_P, col_P, val_P, Y, N, no_dims, theta); // doing approximate computation here!
|
||||
if(iter == 0)
|
||||
printf("Iteration %d: error is %f\n", iter + 1, C);
|
||||
else {
|
||||
total_time += (float) (end - start) / CLOCKS_PER_SEC;
|
||||
printf("Iteration %d: error is %f (50 iterations in %4.2f seconds)\n", iter, C, (float) (end - start) / CLOCKS_PER_SEC);
|
||||
}
|
||||
start = clock();
|
||||
}
|
||||
}
|
||||
end = clock(); total_time += (float) (end - start) / CLOCKS_PER_SEC;
|
||||
|
||||
// Clean up memory
|
||||
free(dY);
|
||||
free(uY);
|
||||
free(gains);
|
||||
if(exact) free(P);
|
||||
else {
|
||||
free(row_P); row_P = NULL;
|
||||
free(col_P); col_P = NULL;
|
||||
free(val_P); val_P = NULL;
|
||||
}
|
||||
printf("Fitting performed in %4.2f seconds.\n", total_time);
|
||||
}
|
||||
|
||||
|
||||
// Compute gradient of the t-SNE cost function (using Barnes-Hut algorithm)
|
||||
static void computeGradient(unsigned int* inp_row_P, unsigned int* inp_col_P, double* inp_val_P, double* Y, int N, int D, double* dC, double theta)
|
||||
{
|
||||
|
||||
// Construct space-partitioning tree on current map
|
||||
SPTree* tree = new SPTree(D, Y, N);
|
||||
|
||||
// Compute all terms required for t-SNE gradient
|
||||
double sum_Q = .0;
|
||||
double* pos_f = (double*) calloc(N * D, sizeof(double));
|
||||
double* neg_f = (double*) calloc(N * D, sizeof(double));
|
||||
if(pos_f == NULL || neg_f == NULL) { printf("Memory allocation failed!\n"); exit(1); }
|
||||
tree->computeEdgeForces(inp_row_P, inp_col_P, inp_val_P, N, pos_f);
|
||||
for(int n = 0; n < N; n++) tree->computeNonEdgeForces(n, theta, neg_f + n * D, &sum_Q);
|
||||
|
||||
// Compute final t-SNE gradient
|
||||
for(int i = 0; i < N * D; i++) {
|
||||
dC[i] = pos_f[i] - (neg_f[i] / sum_Q);
|
||||
}
|
||||
free(pos_f);
|
||||
free(neg_f);
|
||||
delete tree;
|
||||
}
|
||||
|
||||
// Compute gradient of the t-SNE cost function (exact)
|
||||
static void computeExactGradient(double* P, double* Y, int N, int D, double* dC) {
|
||||
|
||||
// Make sure the current gradient contains zeros
|
||||
for(int i = 0; i < N * D; i++) dC[i] = 0.0;
|
||||
|
||||
// Compute the squared Euclidean distance matrix
|
||||
double* DD = (double*) malloc(N * N * sizeof(double));
|
||||
if(DD == NULL) { printf("Memory allocation failed!\n"); exit(1); }
|
||||
computeSquaredEuclideanDistance(Y, N, D, DD);
|
||||
|
||||
// Compute Q-matrix and normalization sum
|
||||
double* Q = (double*) malloc(N * N * sizeof(double));
|
||||
if(Q == NULL) { printf("Memory allocation failed!\n"); exit(1); }
|
||||
double sum_Q = .0;
|
||||
int nN = 0;
|
||||
for(int n = 0; n < N; n++) {
|
||||
for(int m = 0; m < N; m++) {
|
||||
if(n != m) {
|
||||
Q[nN + m] = 1 / (1 + DD[nN + m]);
|
||||
sum_Q += Q[nN + m];
|
||||
}
|
||||
}
|
||||
nN += N;
|
||||
}
|
||||
|
||||
// Perform the computation of the gradient
|
||||
nN = 0;
|
||||
int nD = 0;
|
||||
for(int n = 0; n < N; n++) {
|
||||
int mD = 0;
|
||||
for(int m = 0; m < N; m++) {
|
||||
if(n != m) {
|
||||
double mult = (P[nN + m] - (Q[nN + m] / sum_Q)) * Q[nN + m];
|
||||
for(int d = 0; d < D; d++) {
|
||||
dC[nD + d] += (Y[nD + d] - Y[mD + d]) * mult;
|
||||
}
|
||||
}
|
||||
mD += D;
|
||||
}
|
||||
nN += N;
|
||||
nD += D;
|
||||
}
|
||||
|
||||
// Free memory
|
||||
free(DD); DD = NULL;
|
||||
free(Q); Q = NULL;
|
||||
}
|
||||
|
||||
|
||||
// Evaluate t-SNE cost function (exactly)
|
||||
static double evaluateError(double* P, double* Y, int N, int D) {
|
||||
|
||||
// Compute the squared Euclidean distance matrix
|
||||
double* DD = (double*) malloc(N * N * sizeof(double));
|
||||
double* Q = (double*) malloc(N * N * sizeof(double));
|
||||
if(DD == NULL || Q == NULL) { printf("Memory allocation failed!\n"); exit(1); }
|
||||
computeSquaredEuclideanDistance(Y, N, D, DD);
|
||||
|
||||
// Compute Q-matrix and normalization sum
|
||||
int nN = 0;
|
||||
double sum_Q = DBL_MIN;
|
||||
for(int n = 0; n < N; n++) {
|
||||
for(int m = 0; m < N; m++) {
|
||||
if(n != m) {
|
||||
Q[nN + m] = 1 / (1 + DD[nN + m]);
|
||||
sum_Q += Q[nN + m];
|
||||
}
|
||||
else Q[nN + m] = DBL_MIN;
|
||||
}
|
||||
nN += N;
|
||||
}
|
||||
for(int i = 0; i < N * N; i++) Q[i] /= sum_Q;
|
||||
|
||||
// Sum t-SNE error
|
||||
double C = .0;
|
||||
for(int n = 0; n < N * N; n++) {
|
||||
C += P[n] * log((P[n] + FLT_MIN) / (Q[n] + FLT_MIN));
|
||||
}
|
||||
|
||||
// Clean up memory
|
||||
free(DD);
|
||||
free(Q);
|
||||
return C;
|
||||
}
|
||||
|
||||
// Evaluate t-SNE cost function (approximately)
|
||||
static double evaluateError(unsigned int* row_P, unsigned int* col_P, double* val_P, double* Y, int N, int D, double theta)
|
||||
{
|
||||
|
||||
// Get estimate of normalization term
|
||||
SPTree* tree = new SPTree(D, Y, N);
|
||||
double* buff = (double*) calloc(D, sizeof(double));
|
||||
double sum_Q = .0;
|
||||
for(int n = 0; n < N; n++) tree->computeNonEdgeForces(n, theta, buff, &sum_Q);
|
||||
|
||||
// Loop over all edges to compute t-SNE error
|
||||
int ind1, ind2;
|
||||
double C = .0, Q;
|
||||
for(int n = 0; n < N; n++) {
|
||||
ind1 = n * D;
|
||||
for(int i = row_P[n]; i < row_P[n + 1]; i++) {
|
||||
Q = .0;
|
||||
ind2 = col_P[i] * D;
|
||||
for(int d = 0; d < D; d++) buff[d] = Y[ind1 + d];
|
||||
for(int d = 0; d < D; d++) buff[d] -= Y[ind2 + d];
|
||||
for(int d = 0; d < D; d++) Q += buff[d] * buff[d];
|
||||
Q = (1.0 / (1.0 + Q)) / sum_Q;
|
||||
C += val_P[i] * log((val_P[i] + FLT_MIN) / (Q + FLT_MIN));
|
||||
}
|
||||
}
|
||||
|
||||
// Clean up memory
|
||||
free(buff);
|
||||
delete tree;
|
||||
return C;
|
||||
}
|
||||
|
||||
|
||||
// Compute input similarities with a fixed perplexity
|
||||
static void computeGaussianPerplexity(double* X, int N, int D, double* P, double perplexity) {
|
||||
|
||||
// Compute the squared Euclidean distance matrix
|
||||
double* DD = (double*) malloc(N * N * sizeof(double));
|
||||
if(DD == NULL) { printf("Memory allocation failed!\n"); exit(1); }
|
||||
computeSquaredEuclideanDistance(X, N, D, DD);
|
||||
|
||||
// Compute the Gaussian kernel row by row
|
||||
int nN = 0;
|
||||
for(int n = 0; n < N; n++) {
|
||||
|
||||
// Initialize some variables
|
||||
bool found = false;
|
||||
double beta = 1.0;
|
||||
double min_beta = -DBL_MAX;
|
||||
double max_beta = DBL_MAX;
|
||||
double tol = 1e-5;
|
||||
double sum_P;
|
||||
|
||||
// Iterate until we found a good perplexity
|
||||
int iter = 0;
|
||||
while(!found && iter < 200) {
|
||||
|
||||
// Compute Gaussian kernel row
|
||||
for(int m = 0; m < N; m++) P[nN + m] = exp(-beta * DD[nN + m]);
|
||||
P[nN + n] = DBL_MIN;
|
||||
|
||||
// Compute entropy of current row
|
||||
sum_P = DBL_MIN;
|
||||
for(int m = 0; m < N; m++) sum_P += P[nN + m];
|
||||
double H = 0.0;
|
||||
for(int m = 0; m < N; m++) H += beta * (DD[nN + m] * P[nN + m]);
|
||||
H = (H / sum_P) + log(sum_P);
|
||||
|
||||
// Evaluate whether the entropy is within the tolerance level
|
||||
double Hdiff = H - log(perplexity);
|
||||
if(Hdiff < tol && -Hdiff < tol) {
|
||||
found = true;
|
||||
}
|
||||
else {
|
||||
if(Hdiff > 0) {
|
||||
min_beta = beta;
|
||||
if(max_beta == DBL_MAX || max_beta == -DBL_MAX)
|
||||
beta *= 2.0;
|
||||
else
|
||||
beta = (beta + max_beta) / 2.0;
|
||||
}
|
||||
else {
|
||||
max_beta = beta;
|
||||
if(min_beta == -DBL_MAX || min_beta == DBL_MAX){
|
||||
if (beta < 0) {
|
||||
beta *= 2;
|
||||
} else {
|
||||
beta = beta <= 1.0 ? -0.5 : beta / 2.0;
|
||||
}
|
||||
} else {
|
||||
beta = (beta + min_beta) / 2.0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Update iteration counter
|
||||
iter++;
|
||||
}
|
||||
|
||||
// Row normalize P
|
||||
for(int m = 0; m < N; m++) P[nN + m] /= sum_P;
|
||||
nN += N;
|
||||
}
|
||||
|
||||
// Clean up memory
|
||||
free(DD); DD = NULL;
|
||||
}
|
||||
|
||||
|
||||
// Compute input similarities with a fixed perplexity using ball trees (this function allocates memory another function should free)
|
||||
static void computeGaussianPerplexity(double* X, int N, int D, unsigned int** _row_P, unsigned int** _col_P, double** _val_P, double perplexity, int K) {
|
||||
|
||||
if(perplexity > K) printf("Perplexity should be lower than K!\n");
|
||||
|
||||
// Allocate the memory we need
|
||||
*_row_P = (unsigned int*) malloc((N + 1) * sizeof(unsigned int));
|
||||
*_col_P = (unsigned int*) calloc(N * K, sizeof(unsigned int));
|
||||
*_val_P = (double*) calloc(N * K, sizeof(double));
|
||||
if(*_row_P == NULL || *_col_P == NULL || *_val_P == NULL) { printf("Memory allocation failed!\n"); exit(1); }
|
||||
unsigned int* row_P = *_row_P;
|
||||
unsigned int* col_P = *_col_P;
|
||||
double* val_P = *_val_P;
|
||||
double* cur_P = (double*) malloc((N - 1) * sizeof(double));
|
||||
if(cur_P == NULL) { printf("Memory allocation failed!\n"); exit(1); }
|
||||
row_P[0] = 0;
|
||||
for(int n = 0; n < N; n++) row_P[n + 1] = row_P[n] + (unsigned int) K;
|
||||
|
||||
// Build ball tree on data set
|
||||
VpTree<DataPoint, euclidean_distance>* tree = new VpTree<DataPoint, euclidean_distance>();
|
||||
vector<DataPoint> obj_X(N, DataPoint(D, -1, X));
|
||||
for(int n = 0; n < N; n++) obj_X[n] = DataPoint(D, n, X + n * D);
|
||||
tree->create(obj_X);
|
||||
|
||||
// Loop over all points to find nearest neighbors
|
||||
printf("Building tree...\n");
|
||||
vector<DataPoint> indices;
|
||||
vector<double> distances;
|
||||
for(int n = 0; n < N; n++) {
|
||||
|
||||
if(n % 10000 == 0) printf(" - point %d of %d\n", n, N);
|
||||
|
||||
// Find nearest neighbors
|
||||
indices.clear();
|
||||
distances.clear();
|
||||
tree->search(obj_X[n], K + 1, &indices, &distances);
|
||||
|
||||
// Initialize some variables for binary search
|
||||
bool found = false;
|
||||
double beta = 1.0;
|
||||
double min_beta = -DBL_MAX;
|
||||
double max_beta = DBL_MAX;
|
||||
double tol = 1e-5;
|
||||
|
||||
// Iterate until we found a good perplexity
|
||||
int iter = 0; double sum_P;
|
||||
while(!found && iter < 200) {
|
||||
|
||||
// Compute Gaussian kernel row
|
||||
for(int m = 0; m < K; m++) cur_P[m] = exp(-beta * distances[m + 1] * distances[m + 1]);
|
||||
|
||||
// Compute entropy of current row
|
||||
sum_P = DBL_MIN;
|
||||
for(int m = 0; m < K; m++) sum_P += cur_P[m];
|
||||
double H = .0;
|
||||
for(int m = 0; m < K; m++) H += beta * (distances[m + 1] * distances[m + 1] * cur_P[m]);
|
||||
H = (H / sum_P) + log(sum_P);
|
||||
|
||||
// Evaluate whether the entropy is within the tolerance level
|
||||
double Hdiff = H - log(perplexity);
|
||||
if(Hdiff < tol && -Hdiff < tol) {
|
||||
found = true;
|
||||
}
|
||||
else {
|
||||
if(Hdiff > 0) {
|
||||
min_beta = beta;
|
||||
if(max_beta == DBL_MAX || max_beta == -DBL_MAX)
|
||||
beta *= 2.0;
|
||||
else
|
||||
beta = (beta + max_beta) / 2.0;
|
||||
}
|
||||
else {
|
||||
max_beta = beta;
|
||||
if(min_beta == -DBL_MAX || min_beta == DBL_MAX)
|
||||
beta /= 2.0;
|
||||
else
|
||||
beta = (beta + min_beta) / 2.0;
|
||||
}
|
||||
}
|
||||
|
||||
// Update iteration counter
|
||||
iter++;
|
||||
}
|
||||
|
||||
// Row-normalize current row of P and store in matrix
|
||||
for(unsigned int m = 0; m < K; m++) cur_P[m] /= sum_P;
|
||||
for(unsigned int m = 0; m < K; m++) {
|
||||
col_P[row_P[n] + m] = (unsigned int) indices[m + 1].index();
|
||||
val_P[row_P[n] + m] = cur_P[m];
|
||||
}
|
||||
}
|
||||
|
||||
// Clean up memory
|
||||
obj_X.clear();
|
||||
free(cur_P);
|
||||
delete tree;
|
||||
}
|
||||
|
||||
|
||||
// Symmetrizes a sparse matrix
|
||||
static void symmetrizeMatrix(unsigned int** _row_P, unsigned int** _col_P, double** _val_P, int N) {
|
||||
|
||||
// Get sparse matrix
|
||||
unsigned int* row_P = *_row_P;
|
||||
unsigned int* col_P = *_col_P;
|
||||
double* val_P = *_val_P;
|
||||
|
||||
// Count number of elements and row counts of symmetric matrix
|
||||
int* row_counts = (int*) calloc(N, sizeof(int));
|
||||
if(row_counts == NULL) { printf("Memory allocation failed!\n"); exit(1); }
|
||||
for(int n = 0; n < N; n++) {
|
||||
for(int i = row_P[n]; i < row_P[n + 1]; i++) {
|
||||
|
||||
// Check whether element (col_P[i], n) is present
|
||||
bool present = false;
|
||||
for(int m = row_P[col_P[i]]; m < row_P[col_P[i] + 1]; m++) {
|
||||
if(col_P[m] == n) present = true;
|
||||
}
|
||||
if(present) row_counts[n]++;
|
||||
else {
|
||||
row_counts[n]++;
|
||||
row_counts[col_P[i]]++;
|
||||
}
|
||||
}
|
||||
}
|
||||
int no_elem = 0;
|
||||
for(int n = 0; n < N; n++) no_elem += row_counts[n];
|
||||
|
||||
// Allocate memory for symmetrized matrix
|
||||
unsigned int* sym_row_P = (unsigned int*) malloc((N + 1) * sizeof(unsigned int));
|
||||
unsigned int* sym_col_P = (unsigned int*) malloc(no_elem * sizeof(unsigned int));
|
||||
double* sym_val_P = (double*) malloc(no_elem * sizeof(double));
|
||||
if(sym_row_P == NULL || sym_col_P == NULL || sym_val_P == NULL) { printf("Memory allocation failed!\n"); exit(1); }
|
||||
|
||||
// Construct new row indices for symmetric matrix
|
||||
sym_row_P[0] = 0;
|
||||
for(int n = 0; n < N; n++) sym_row_P[n + 1] = sym_row_P[n] + (unsigned int) row_counts[n];
|
||||
|
||||
// Fill the result matrix
|
||||
int* offset = (int*) calloc(N, sizeof(int));
|
||||
if(offset == NULL) { printf("Memory allocation failed!\n"); exit(1); }
|
||||
for(int n = 0; n < N; n++) {
|
||||
for(unsigned int i = row_P[n]; i < row_P[n + 1]; i++) { // considering element(n, col_P[i])
|
||||
|
||||
// Check whether element (col_P[i], n) is present
|
||||
bool present = false;
|
||||
for(unsigned int m = row_P[col_P[i]]; m < row_P[col_P[i] + 1]; m++) {
|
||||
if(col_P[m] == n) {
|
||||
present = true;
|
||||
if(n <= col_P[i]) { // make sure we do not add elements twice
|
||||
sym_col_P[sym_row_P[n] + offset[n]] = col_P[i];
|
||||
sym_col_P[sym_row_P[col_P[i]] + offset[col_P[i]]] = n;
|
||||
sym_val_P[sym_row_P[n] + offset[n]] = val_P[i] + val_P[m];
|
||||
sym_val_P[sym_row_P[col_P[i]] + offset[col_P[i]]] = val_P[i] + val_P[m];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If (col_P[i], n) is not present, there is no addition involved
|
||||
if(!present) {
|
||||
sym_col_P[sym_row_P[n] + offset[n]] = col_P[i];
|
||||
sym_col_P[sym_row_P[col_P[i]] + offset[col_P[i]]] = n;
|
||||
sym_val_P[sym_row_P[n] + offset[n]] = val_P[i];
|
||||
sym_val_P[sym_row_P[col_P[i]] + offset[col_P[i]]] = val_P[i];
|
||||
}
|
||||
|
||||
// Update offsets
|
||||
if(!present || (present && n <= col_P[i])) {
|
||||
offset[n]++;
|
||||
if(col_P[i] != n) offset[col_P[i]]++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Divide the result by two
|
||||
for(int i = 0; i < no_elem; i++) sym_val_P[i] /= 2.0;
|
||||
|
||||
// Return symmetrized matrices
|
||||
free(*_row_P); *_row_P = sym_row_P;
|
||||
free(*_col_P); *_col_P = sym_col_P;
|
||||
free(*_val_P); *_val_P = sym_val_P;
|
||||
|
||||
// Free up some memery
|
||||
free(offset); offset = NULL;
|
||||
free(row_counts); row_counts = NULL;
|
||||
}
|
||||
|
||||
// Compute squared Euclidean distance matrix
|
||||
static void computeSquaredEuclideanDistance(double* X, int N, int D, double* DD) {
|
||||
const double* XnD = X;
|
||||
for(int n = 0; n < N; ++n, XnD += D) {
|
||||
const double* XmD = XnD + D;
|
||||
double* curr_elem = &DD[n*N + n];
|
||||
*curr_elem = 0.0;
|
||||
double* curr_elem_sym = curr_elem + N;
|
||||
for(int m = n + 1; m < N; ++m, XmD+=D, curr_elem_sym+=N) {
|
||||
*(++curr_elem) = 0.0;
|
||||
for(int d = 0; d < D; ++d) {
|
||||
*curr_elem += (XnD[d] - XmD[d]) * (XnD[d] - XmD[d]);
|
||||
}
|
||||
*curr_elem_sym = *curr_elem;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Makes data zero-mean
|
||||
static void zeroMean(double* X, int N, int D) {
|
||||
|
||||
// Compute data mean
|
||||
double* mean = (double*) calloc(D, sizeof(double));
|
||||
if(mean == NULL) { printf("Memory allocation failed!\n"); exit(1); }
|
||||
int nD = 0;
|
||||
for(int n = 0; n < N; n++) {
|
||||
for(int d = 0; d < D; d++) {
|
||||
mean[d] += X[nD + d];
|
||||
}
|
||||
nD += D;
|
||||
}
|
||||
for(int d = 0; d < D; d++) {
|
||||
mean[d] /= (double) N;
|
||||
}
|
||||
|
||||
// Subtract data mean
|
||||
nD = 0;
|
||||
for(int n = 0; n < N; n++) {
|
||||
for(int d = 0; d < D; d++) {
|
||||
X[nD + d] -= mean[d];
|
||||
}
|
||||
nD += D;
|
||||
}
|
||||
free(mean); mean = NULL;
|
||||
}
|
||||
|
||||
|
||||
// Generates a Gaussian random number
|
||||
static double randn() {
|
||||
double x, y, radius;
|
||||
do {
|
||||
x = 2 * (rand() / ((double) RAND_MAX + 1)) - 1;
|
||||
y = 2 * (rand() / ((double) RAND_MAX + 1)) - 1;
|
||||
radius = (x * x) + (y * y);
|
||||
} while((radius >= 1.0) || (radius == 0.0));
|
||||
radius = sqrt(-2 * log(radius) / radius);
|
||||
x *= radius;
|
||||
y *= radius;
|
||||
return x;
|
||||
}
|
||||
|
||||
// Function that loads data from a t-SNE file
|
||||
// Note: this function does a malloc that should be freed elsewhere
|
||||
bool TSNE::load_data(double** data, int* n, int* d, int* no_dims, double* theta, double* perplexity, int* rand_seed, int* max_iter) {
|
||||
|
||||
// Open file, read first 2 integers, allocate memory, and read the data
|
||||
FILE *h;
|
||||
if((h = fopen("data.dat", "r+b")) == NULL) {
|
||||
printf("Error: could not open data file.\n");
|
||||
return false;
|
||||
}
|
||||
fread(n, sizeof(int), 1, h); // number of datapoints
|
||||
fread(d, sizeof(int), 1, h); // original dimensionality
|
||||
fread(theta, sizeof(double), 1, h); // gradient accuracy
|
||||
fread(perplexity, sizeof(double), 1, h); // perplexity
|
||||
fread(no_dims, sizeof(int), 1, h); // output dimensionality
|
||||
fread(max_iter, sizeof(int),1,h); // maximum number of iterations
|
||||
*data = (double*) malloc(*d * *n * sizeof(double));
|
||||
if(*data == NULL) { printf("Memory allocation failed!\n"); exit(1); }
|
||||
fread(*data, sizeof(double), *n * *d, h); // the data
|
||||
if(!feof(h)) fread(rand_seed, sizeof(int), 1, h); // random seed
|
||||
fclose(h);
|
||||
printf("Read the %i x %i data matrix successfully!\n", *n, *d);
|
||||
return true;
|
||||
}
|
||||
|
||||
// Function that saves map to a t-SNE file
|
||||
void TSNE::save_data(double* data, int* landmarks, double* costs, int n, int d) {
|
||||
|
||||
// Open file, write first 2 integers and then the data
|
||||
FILE *h;
|
||||
if((h = fopen("result.dat", "w+b")) == NULL) {
|
||||
printf("Error: could not open data file.\n");
|
||||
return;
|
||||
}
|
||||
fwrite(&n, sizeof(int), 1, h);
|
||||
fwrite(&d, sizeof(int), 1, h);
|
||||
fwrite(data, sizeof(double), n * d, h);
|
||||
fwrite(landmarks, sizeof(int), n, h);
|
||||
fwrite(costs, sizeof(double), n, h);
|
||||
fclose(h);
|
||||
printf("Wrote the %i x %i data matrix successfully!\n", n, d);
|
||||
}
|
||||
|
||||
#endif
|
||||
49
third_party/bhtsne/tsne_test.cpp
vendored
Executable file
49
third_party/bhtsne/tsne_test.cpp
vendored
Executable file
@@ -0,0 +1,49 @@
|
||||
#include <cfloat>
|
||||
#include <cmath>
|
||||
#include <cstdlib>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <ctime>
|
||||
#include <iostream>
|
||||
|
||||
#include "tsne.h"
|
||||
|
||||
// Function that runs the Barnes-Hut implementation of t-SNE
|
||||
// simulate input data from code, print result to console
|
||||
// 3 dims ---> 2 dims for this test
|
||||
int main() {
|
||||
|
||||
// Define some variables
|
||||
int origN = 8, N = 8, D = 3, no_dims = 2, max_iter = 1000;
|
||||
double perplexity = 2, theta = 0.5;
|
||||
int rand_seed = -1;
|
||||
double data[] = {100, 23.9, 12,
|
||||
45.2, 46.9, 10,
|
||||
201, 44.8, 30,
|
||||
201, 40.2, 32,
|
||||
101, 7.8, 33,
|
||||
21, 4, 35,
|
||||
441, 4.8, 36,
|
||||
2.1, 32.4, 56};
|
||||
|
||||
N = origN;
|
||||
|
||||
// Now fire up the SNE implementation
|
||||
double* Y = (double*) malloc(N * no_dims * sizeof(double));
|
||||
if(Y == NULL) { printf("Memory allocation failed!\n"); exit(1); }
|
||||
|
||||
// data: input high dims of features, N * D
|
||||
// Y: output low dims of features, N * no_dims
|
||||
TSNE::run(data, N, D, Y, no_dims, perplexity, theta, rand_seed, false, max_iter, 250, 250);
|
||||
|
||||
// Print the results
|
||||
for (int i = 0; i < N; i++) {
|
||||
/* code */
|
||||
std::cout << Y[i] << "," << Y[i+1] << std::endl;
|
||||
}
|
||||
|
||||
// you can display result on 2D screen here
|
||||
|
||||
// Clean up the memory
|
||||
free(Y); Y = NULL;
|
||||
}
|
||||
272
third_party/bhtsne/vptree.h
vendored
Executable file
272
third_party/bhtsne/vptree.h
vendored
Executable file
@@ -0,0 +1,272 @@
|
||||
/*
|
||||
*
|
||||
* Copyright (c) 2014, Laurens van der Maaten (Delft University of Technology)
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* 3. All advertising materials mentioning features or use of this software
|
||||
* must display the following acknowledgement:
|
||||
* This product includes software developed by the Delft University of Technology.
|
||||
* 4. Neither the name of the Delft University of Technology nor the names of
|
||||
* its contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY LAURENS VAN DER MAATEN ''AS IS'' AND ANY EXPRESS
|
||||
* OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
|
||||
* EVENT SHALL LAURENS VAN DER MAATEN BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
|
||||
* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
|
||||
* IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
|
||||
* OF SUCH DAMAGE.
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
/* This code was adopted with minor modifications from Steve Hanov's great tutorial at http://stevehanov.ca/blog/index.php?id=130 */
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <algorithm>
|
||||
#include <vector>
|
||||
#include <stdio.h>
|
||||
#include <queue>
|
||||
#include <limits>
|
||||
#include <cmath>
|
||||
|
||||
|
||||
#ifndef VPTREE_H
|
||||
#define VPTREE_H
|
||||
|
||||
class DataPoint
|
||||
{
|
||||
int _ind;
|
||||
|
||||
public:
|
||||
double* _x;
|
||||
int _D;
|
||||
DataPoint() {
|
||||
_D = 1;
|
||||
_ind = -1;
|
||||
_x = NULL;
|
||||
}
|
||||
DataPoint(int D, int ind, double* x) {
|
||||
_D = D;
|
||||
_ind = ind;
|
||||
_x = (double*) malloc(_D * sizeof(double));
|
||||
for(int d = 0; d < _D; d++) _x[d] = x[d];
|
||||
}
|
||||
DataPoint(const DataPoint& other) { // this makes a deep copy -- should not free anything
|
||||
if(this != &other) {
|
||||
_D = other.dimensionality();
|
||||
_ind = other.index();
|
||||
_x = (double*) malloc(_D * sizeof(double));
|
||||
for(int d = 0; d < _D; d++) _x[d] = other.x(d);
|
||||
}
|
||||
}
|
||||
~DataPoint() { if(_x != NULL) free(_x); }
|
||||
DataPoint& operator= (const DataPoint& other) { // asignment should free old object
|
||||
if(this != &other) {
|
||||
if(_x != NULL) free(_x);
|
||||
_D = other.dimensionality();
|
||||
_ind = other.index();
|
||||
_x = (double*) malloc(_D * sizeof(double));
|
||||
for(int d = 0; d < _D; d++) _x[d] = other.x(d);
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
int index() const { return _ind; }
|
||||
int dimensionality() const { return _D; }
|
||||
double x(int d) const { return _x[d]; }
|
||||
};
|
||||
|
||||
double euclidean_distance(const DataPoint &t1, const DataPoint &t2) {
|
||||
double dd = .0;
|
||||
double* x1 = t1._x;
|
||||
double* x2 = t2._x;
|
||||
double diff;
|
||||
for(int d = 0; d < t1._D; d++) {
|
||||
diff = (x1[d] - x2[d]);
|
||||
dd += diff * diff;
|
||||
}
|
||||
return sqrt(dd);
|
||||
}
|
||||
|
||||
|
||||
template<typename T, double (*distance)( const T&, const T& )>
|
||||
class VpTree
|
||||
{
|
||||
public:
|
||||
|
||||
// Default constructor
|
||||
VpTree() : _root(0) {}
|
||||
|
||||
// Destructor
|
||||
~VpTree() {
|
||||
delete _root;
|
||||
}
|
||||
|
||||
// Function to create a new VpTree from data
|
||||
void create(const std::vector<T>& items) {
|
||||
delete _root;
|
||||
_items = items;
|
||||
_root = buildFromPoints(0, items.size());
|
||||
}
|
||||
|
||||
// Function that uses the tree to find the k nearest neighbors of target
|
||||
void search(const T& target, int k, std::vector<T>* results, std::vector<double>* distances)
|
||||
{
|
||||
|
||||
// Use a priority queue to store intermediate results on
|
||||
std::priority_queue<HeapItem> heap;
|
||||
|
||||
// Variable that tracks the distance to the farthest point in our results
|
||||
_tau = DBL_MAX;
|
||||
|
||||
// Perform the search
|
||||
search(_root, target, k, heap);
|
||||
|
||||
// Gather final results
|
||||
results->clear(); distances->clear();
|
||||
while(!heap.empty()) {
|
||||
results->push_back(_items[heap.top().index]);
|
||||
distances->push_back(heap.top().dist);
|
||||
heap.pop();
|
||||
}
|
||||
|
||||
// Results are in reverse order
|
||||
std::reverse(results->begin(), results->end());
|
||||
std::reverse(distances->begin(), distances->end());
|
||||
}
|
||||
|
||||
private:
|
||||
std::vector<T> _items;
|
||||
double _tau;
|
||||
|
||||
// Single node of a VP tree (has a point and radius; left children are closer to point than the radius)
|
||||
struct Node
|
||||
{
|
||||
int index; // index of point in node
|
||||
double threshold; // radius(?)
|
||||
Node* left; // points closer by than threshold
|
||||
Node* right; // points farther away than threshold
|
||||
|
||||
Node() :
|
||||
index(0), threshold(0.), left(0), right(0) {}
|
||||
|
||||
~Node() { // destructor
|
||||
delete left;
|
||||
delete right;
|
||||
}
|
||||
}* _root;
|
||||
|
||||
|
||||
// An item on the intermediate result queue
|
||||
struct HeapItem {
|
||||
HeapItem( int index, double dist) :
|
||||
index(index), dist(dist) {}
|
||||
int index;
|
||||
double dist;
|
||||
bool operator<(const HeapItem& o) const {
|
||||
return dist < o.dist;
|
||||
}
|
||||
};
|
||||
|
||||
// Distance comparator for use in std::nth_element
|
||||
struct DistanceComparator
|
||||
{
|
||||
const T& item;
|
||||
DistanceComparator(const T& item) : item(item) {}
|
||||
bool operator()(const T& a, const T& b) {
|
||||
return distance(item, a) < distance(item, b);
|
||||
}
|
||||
};
|
||||
|
||||
// Function that (recursively) fills the tree
|
||||
Node* buildFromPoints( int lower, int upper )
|
||||
{
|
||||
if (upper == lower) { // indicates that we're done here!
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// Lower index is center of current node
|
||||
Node* node = new Node();
|
||||
node->index = lower;
|
||||
|
||||
if (upper - lower > 1) { // if we did not arrive at leaf yet
|
||||
|
||||
// Choose an arbitrary point and move it to the start
|
||||
int i = (int) ((double)rand() / RAND_MAX * (upper - lower - 1)) + lower;
|
||||
std::swap(_items[lower], _items[i]);
|
||||
|
||||
// Partition around the median distance
|
||||
int median = (upper + lower) / 2;
|
||||
std::nth_element(_items.begin() + lower + 1,
|
||||
_items.begin() + median,
|
||||
_items.begin() + upper,
|
||||
DistanceComparator(_items[lower]));
|
||||
|
||||
// Threshold of the new node will be the distance to the median
|
||||
node->threshold = distance(_items[lower], _items[median]);
|
||||
|
||||
// Recursively build tree
|
||||
node->index = lower;
|
||||
node->left = buildFromPoints(lower + 1, median);
|
||||
node->right = buildFromPoints(median, upper);
|
||||
}
|
||||
|
||||
// Return result
|
||||
return node;
|
||||
}
|
||||
|
||||
// Helper function that searches the tree
|
||||
void search(Node* node, const T& target, int k, std::priority_queue<HeapItem>& heap)
|
||||
{
|
||||
if(node == NULL) return; // indicates that we're done here
|
||||
|
||||
// Compute distance between target and current node
|
||||
double dist = distance(_items[node->index], target);
|
||||
|
||||
// If current node within radius tau
|
||||
if(dist < _tau) {
|
||||
if(heap.size() == k) heap.pop(); // remove furthest node from result list (if we already have k results)
|
||||
heap.push(HeapItem(node->index, dist)); // add current node to result list
|
||||
if(heap.size() == k) _tau = heap.top().dist; // update value of tau (farthest point in result list)
|
||||
}
|
||||
|
||||
// Return if we arrived at a leaf
|
||||
if(node->left == NULL && node->right == NULL) {
|
||||
return;
|
||||
}
|
||||
|
||||
// If the target lies within the radius of ball
|
||||
if(dist < node->threshold) {
|
||||
if(dist - _tau <= node->threshold) { // if there can still be neighbors inside the ball, recursively search left child first
|
||||
search(node->left, target, k, heap);
|
||||
}
|
||||
|
||||
if(dist + _tau >= node->threshold) { // if there can still be neighbors outside the ball, recursively search right child
|
||||
search(node->right, target, k, heap);
|
||||
}
|
||||
|
||||
// If the target lies outsize the radius of the ball
|
||||
} else {
|
||||
if(dist + _tau >= node->threshold) { // if there can still be neighbors outside the ball, recursively search right child first
|
||||
search(node->right, target, k, heap);
|
||||
}
|
||||
|
||||
if (dist - _tau <= node->threshold) { // if there can still be neighbors inside the ball, recursively search left child
|
||||
search(node->left, target, k, heap);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
#endif
|
||||
Reference in New Issue
Block a user