Commit 17258d6c authored by Andrew Cohen's avatar Andrew Cohen
Browse files

removed spare parts

parent 0bf58d85
function [kGap Gap S idx] = GapSpectral(DistanceMatrix,nMaxClusters,bAlgorithmicInformationDistance)
% GAPSPECTRAL  Pick a cluster count for spectral clustering with a gap statistic.
%
%   [kGap,Gap,S,idx] = GapSpectral(DistanceMatrix,nMaxClusters)
%   [kGap,Gap,S,idx] = GapSpectral(DistanceMatrix,nMaxClusters,bAlgorithmicInformationDistance)
%
%   Inputs
%     DistanceMatrix  pairwise distance matrix; it is passed through
%                     Regularize before any clustering is done.
%     nMaxClusters    largest cluster count to evaluate. When it exceeds the
%                     number of rows n of DistanceMatrix it is reduced to n-1.
%     bAlgorithmicInformationDistance
%                     optional, default 1. Nonzero: the gap compares the raw
%                     dispersions. Zero: the gap compares their logarithms.
%
%   Outputs
%     kGap  first k with Gap(k) >= Gap(k+1) - S(k+1); nMaxClusters when no
%           such k is found.
%     Gap   1-by-nMaxClusters gap values (mean reference dispersion minus
%           the dispersion of the data, or the log form of both).
%     S     1-by-nMaxClusters spread of the reference dispersions,
%           std(.,1)*sqrt(1+1/B).
%     idx   cluster labels for kGap clusters: all ones when kGap is 1,
%           otherwise whatever SpectralCluster(D,kGap) returns.
%
%   The reference distribution consists of B = 50 matrices drawn uniformly
%   between the smallest and largest off-diagonal entries of the
%   regularized distance matrix.

if nargin<3
    bAlgorithmicInformationDistance=1;
end
B = 50; % size of Monte Carlo distribution

nPoints = size(DistanceMatrix,1);
if nMaxClusters>nPoints
    nMaxClusters = nPoints-1;
end

D = Regularize(DistanceMatrix);

% Range of the off-diagonal distances (min/max skip the NaN diagonal).
bound=D;
for i=1:size(bound,1)
    bound(i,i)=NaN;
end
a = min(min(bound));
b = max(max(bound));

% Draw the reference matrices and turn each into a valid distance matrix
% once, up front: the regularized form does not depend on k, so there is no
% reason to recompute it inside the k loop.
UV = a + (b-a)*rand(size(D,1),size(D,2),B); % uniform distribution
for ib=1:B
    UV(:,:,ib) = Regularize(UV(:,:,ib));
end

% Preallocate so the outputs exist (possibly empty) even when the loop
% below runs zero times, and so nothing grows inside the loop.
nK  = max(nMaxClusters,0);
W   = zeros(1,nK);
Wb  = zeros(B,nK);
Gap = zeros(1,nK);
S   = zeros(1,nK);

for k=1:nMaxClusters
    % Dispersion of the real data partitioned into k clusters.
    if (1==k)
        labels = ones(size(D,1),1); % one happy cluster
    else
        labels = SpectralCluster(D,k);
    end
    W(k)=WkSpectral(k,labels,D);

    % Dispersion of every reference matrix partitioned into k clusters.
    for ib=1:B
        uni = UV(:,:,ib);
        if (1==k)
            refLabels = ones(size(D,1),1);
        else
            refLabels = SpectralCluster(uni,k);
        end
        Wb(ib,k)=WkSpectral(k,refLabels,uni);
    end

    Wkb = Wb(:,k);
    if bAlgorithmicInformationDistance
        Gap(k) = 1/B*sum(Wkb) - W(k);
        sdk = std(Wkb,1);
    else
        lkb = log(Wkb);
        Gap(k) = 1/B*sum(lkb) - log(W(k));
        sdk = std(lkb,1);
    end
    % Inflate the spread to account for the Monte Carlo sampling error.
    S(k)=sdk * sqrt(1+1/B);
end

% figure
% errorbar( [1:nMaxClusters],Gap,S)
% set(gca,'XTick',[1:nMaxClusters])

% Advance while the next k is still better by more than its spread.
k=1;
while ((k<nMaxClusters) && (Gap(k) < Gap(k+1)-S(k+1)))
    k=k+1;
end
kGap=k;

% Labels returned to the caller always come from the real data.
if (kGap>1)
    idx = SpectralCluster(D,kGap);
else
    idx = ones(size(D,1),1);
end
function f = GetCount(words,strServer)
% GETCOUNT  Hit count, plus one, for a set of search terms on one server.
%
%   f = GetCount(words)
%   f = GetCount(words,strServer)
%
%   words      non-empty cell array of character vectors. The entries are
%              joined with '+', and every space inside an entry is also
%              replaced by '+', to form a single query term.
%   strServer  optional server name: 'pubmed', 'amazon', 'google' or
%              'reddit'. Any other value, or omitting the argument, selects
%              the wikipedia backend.
%
%   f is the count reported by the selected backend plus 1, so the result
%   is never zero.

if ~exist('strServer','var')
    strServer='wikipedia'; % semicolon: do not echo the default on every call
end

if isempty(words)
    error('GetCount:emptyWords','GetCount needs at least one search term.');
end

% Build "w1+w2+...": '+' both between entries and in place of spaces.
term = strrep(strjoin(words(:).','+'),' ','+');

if strcmp(strServer,'pubmed')
    f=Count.pubmed(term);
elseif strcmp(strServer,'amazon')
    % NOTE(review): the only doQueryAmazon visible near this function is
    % commented out; confirm one exists on the path before using 'amazon'.
    f=doQueryAmazon(term);
elseif strcmp(strServer,'google')
    f=GetCountGoogle(term);
elseif strcmp(strServer,'reddit')
    f=Count.reddit(term);
else
    f=Count.wikipedia(term);
end

% f=max(f,1); % no zero counts - sends nwd to NaN
f=f+1;
end
% amazon no longer returning search totals as of 1/2019...
% function f=doQueryAmazon(term)
%
% URL=['http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=' term '&rh=i%3Aaps%2Ck%3A' term];
% for i=1:5
%
% str = urlread(URL);
% try
% %amazon
% idx=strfind(str,'results for <span>');
% idx=idx(1)-2;
% idxStart=idx;
% chPrev = str(idxStart-1);
% while(isNumeric(chPrev))
% idxStart=idxStart-1;
% chPrev = str(idxStart-1);
% end
% nx=str(idxStart:idx);
% f=str2double(nx);
% break;
% catch
% f=0;
% pause(1);
% end
% end
%
% end
function fX=GetSynthesizedCount(words)
% GETSYNTHESIZEDCOUNT  Alternating-sign sum of GetCount over all word subsets.
%
%   fX = GetSynthesizedCount(words)
%
%   words  cell array of search terms (k = length(words)).
%
%   Every non-empty subset of words is enumerated through the bits of the
%   integers 1 .. 2^k-1. GetCount of a subset is added when the subset has
%   an odd number of members and subtracted when it has an even number
%   (the inclusion-exclusion sign pattern).

k=length(words);
fX=0;
for i=1:2^k-1
    % Bit j of i (least significant first) says whether words{j} is in the
    % subset. bitget is core MATLAB, so no toolbox function is needed.
    idx=find(bitget(i,1:k));

    if 0==mod(length(idx),2)
        bAdd=-1;
    else
        bAdd=1;
    end

    % The second argument 0 matches none of the server names GetCount
    % compares against, so the call takes GetCount's fallback branch.
    fX=fX+bAdd*GetCount(words(idx),0);
end
function [D,degenerate] = Regularize(DistanceMatrix)
% REGULARIZE  Symmetrize a distance matrix and report zero off-diagonal entries.
%
%   [D,degenerate] = Regularize(DistanceMatrix)
%
%   D           copy of DistanceMatrix whose strict lower triangle is
%               overwritten with the mirrored upper triangle and whose
%               diagonal is set to 0.
%   degenerate  1 when the smallest off-diagonal entry of D equals 0,
%               otherwise 0.

D = DistanceMatrix;
nRows = size(D,1);

% Mirror the upper triangle into the lower one, column by column.
for col = 1:nRows-1
    for row = col+1:nRows
        D(row,col) = D(col,row);
    end
end

% A point is at distance zero from itself.
for p = 1:nRows
    D(p,p) = 0;
end

% Mask the diagonal with NaN so min only sees distances between distinct
% points (min skips NaN).
offDiag = D;
for p = 1:size(offDiag,1)
    offDiag(p,p) = NaN;
end
smallest = min(offDiag(:));

% isequal is false for an empty or NaN minimum, so those cases report 0.
degenerate = double(isequal(smallest,0));
function w=WkSpectral(k,idx,DistanceMatrix)
% WKSPECTRAL  Pooled within-cluster dispersion; called by GapSpectral.
%
%   w = WkSpectral(k,idx,DistanceMatrix)
%
%   k               number of clusters; labels 1..k are examined.
%   idx             cluster label of every point (row or column vector).
%   DistanceMatrix  pairwise distances, indexed by point number.
%
%   For each cluster r the entries DistanceMatrix(p,q) are added up over
%   its members p,q with q at or after p in member order (self pairs
%   included), and the total is divided by twice the cluster size. w is the
%   sum of these per-cluster values. A cluster without members contributes
%   0 rather than 0/0.

Dr = zeros(1,k);
for r=1:k
    % find points in cluster r
    pr = find(idx==r);
    nr = numel(pr); % numel, not size(pr,1): idx may be a row vector
    if nr==0
        continue % empty cluster: leave its contribution at 0
    end

    acc = 0;
    for i=1:nr
        for j=i:nr
            acc = acc + DistanceMatrix(pr(i),pr(j));
        end
    end
    Dr(r) = acc/(2*nr); %(2) in paper
end
w = sum(Dr);
% Experiment script: distances between five groups of book/play titles.
% The timer started here is not read back within this script.
tic
% goNWD
%
%
% Select the 'amazon' server through the global strServer. The missing
% semicolon means the value is echoed to the console.
% NOTE(review): GetCount as defined in this file takes the server as an
% argument and does not read this global; confirm which helpers do.
global strServer
strServer='amazon'
% Create the shared query cache on first use; an existing cache is kept.
global QueryCache
if isempty(QueryCache)
QueryCache.Queries={};
QueryCache.Count=[];
end
% Reset the query / cache-hit counters for this run.
global nQueryCount nCacheCount
nQueryCount=0;
nCacheCount=0;
% Five groups of titles (presumably one author per group - confirm).
% The lists are echoed because the assignments are not terminated with ';'.
words1={'Macbeth','The Tempest','Othello','King Lear','Hamlet'...
'The Merchant of Venice','A Midsummer Nights Dream',...
'Much Ado About Nothing', 'Taming of the Shrew','Twelfth Night' }
words2={'Carrie','Salems Lot','The Shining','The Stand','The Dead Zone',...
'Firestarter','Cujo'}
words3={'Adventures of Huckleberry Finn','A Connecticut Yankee in King Arthurs Court','Life on the Mississippi'...
'Puddnhead Wilson'}
words4 = {'The Old Man and The Sea','The Sun Also Rises','For Whom the Bell Tolls','A Farewell To Arms'}
words5={'Anna Karenina','War and Peace','The Death of Ivan Ilyich'}
% Group label (1..5) for every title, in the order the lists are concatenated below.
idxKey = [ones(1,length(words1)) 2*ones(1,length(words2)) 3*ones(1,length(words3)) 4*ones(1,length(words4)) 5*ones(1,length(words5))]
% Joint counts for the first two groups; results are echoed, not stored.
GetCount(words1)
GetCount(words2)
%
% Run both distance helpers on the concatenated list with the group labels.
[d]=GetDistances([words1,words2,words3,words4,words5],idxKey);
idx=getDistancesPairs([words1,words2,words3,words4,words5],idxKey);
% Experiment script: three two-group word sets run through GetDistances and
% getDistancesPairs; elapsed time is printed by the toc at the end.
tic
% goNWD
%
%
% NOTE(review): strServer is assigned without a 'global strServer'
% declaration in this script; confirm the helpers see this value.
strServer='wikipedia'
% Set 1: colours (label 1) versus animals (label 2).
words1={'red','orange','yellow','green','blue','indigo'}
words2={'lion','tiger','bear','monkey','zebra','elephant','aardvark','lamb','fox','ape','dog'}
idxKey = [ones(1,length(words1)) 2*ones(1,length(words2))]
d=GetDistances([words1,words2],idxKey);
idx=getDistancesPairs([words1,words2],idxKey);
% Set 2: colours (label 1) versus shapes (label 2).
words1={'red','orange','yellow','green','blue','indigo','violet','purple','cyan','white'}
words2={'square','circle','rectangle','ellipse','triangle','rhombus'}
idxKey = [ones(1,length(words1)) 2*ones(1,length(words2))]
d=GetDistances([words1,words2],idxKey);
idx=getDistancesPairs([words1,words2],idxKey);
% Set 3: two groups of politicians' names (labels 1 and 2).
words1={'Barack Obama','Hillary Clinton','John Edwards','Joe Biden','Chris Dodd','Mike Gravel'}
words2={'John McCain','Mitt Romney','Mike Huckabee','Ron Paul','Fred Thompson','Alan Keyes'}
idxKey = [ones(1,length(words1)) 2*ones(1,length(words2))]
d=GetDistances([words1,words2],idxKey);
idx=getDistancesPairs([words1,words2],idxKey);
%
% Disabled set: two groups of fish species names, using an older two-list
% calling form of the helpers.
% words1={'Labidochromis caeruleus','Sciaenochromis fryeri','Betta splendens','Carassius auratus','Melanochromis cyaneorhabdos'}
% words2={'Ecsenius bicolor','Pictichromis paccagnellae','Amphiprion ocellaris ','Paracanthurus hepatus','Chromis viridis'}
% [d1,d2]=GetDistances(words1,words2);
% idx=getDistancesPairs(words1,words2);
toc
% Experiment script: NWD between lists of SNP identifiers (rs numbers), one
% list per disease, queried with strServer set to 'pubmed'. Writes the
% resulting matrices to rr.xls.
global strServer
strServer='pubmed'
%http://www.nature.com/nature/journal/v511/n7510/full/nature13595.html
% supp table 2, in ranked order
% (variable name is a misspelling of "schizophrenia"; kept because it is an identifier)
shizophrenia = {'rs1702294','rs11191419','rs2007044','rs4129585',...
'rs35518360'};
% http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3250464/
% table 1
leukemia = {'rs17483466','rs13397985','rs757978','rs2456449','rs735665',...
'rs783540','rs305061','rs391525','rs1036935','rs11083846'};
%alzheimers
% http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3365264/
% table 1
% http://www.nature.com/tp/journal/v2/n5/fig_tab/tp201245t1.html#figure-title
alzheimers={'rs4420638','rs7561528','rs17817600','rs3748140','rs12808148','rs6856768','rs11738335','rs1357692'};
% obesity
% http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2858696/
% table 1
obesity={'rs10926984','rs12145833','rs2783963','rs11127485','rs17150703','rs13278851'};
%
% % neuroblastoma
% % http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3716226/
% % table 3
% neuroblastoma={'rs6939340','rs4712653','rs6435862','rs3768716','rs7585356'};
% neuroblastoma 2
% http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2742373/table/T1/
neuroblastoma = {'rs6939340','rs4712653','rs9295536','rs3790171','rs7272481'};
% parkinsons
% http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3305333/
% table 2, rs entries, by pvalue
% now trying http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3852568/table/tbl03/
% p<.1% in 2/3
% parkinsons={'rs356219','rs10847864','rs1491942','rs947211','rs2390669'}
parkinsons={'rs356219','rs10847864','rs2942168','rs11724635'}
% http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3839234/
% http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3839234/table/T1/
als = {'rs2303565','rs1344642','rs2814707','rs3849942','rs2453556', 'rs1971791', 'rs8056742'};
% 7-by-1 cell array; each entry is one disease's list of identifiers.
words={ alzheimers;parkinsons;als;shizophrenia;leukemia;obesity;neuroblastoma}
% Create the shared query cache on first use; an existing cache is kept.
global QueryCache
if isempty(QueryCache)
QueryCache.Queries={};
QueryCache.Count=[];
end
% Reset the query / cache-hit counters for this run.
global nQueryCount nCacheCount
nQueryCount=0;
nCacheCount=0;
% d(i,j): NWD of list i concatenated with list j (i == j passes the same
% list twice).
d=[];
for i=1:length(words)
for j=1:length(words)
d(i,j)=NWD( [words{i},words{j}]);
end
end
% dx(i,j): d(i,j) relative to the self-pairing value d(i,i) of row i.
dx=[];
for i=1:length(words)
for j=1:length(words)
dx(i,j)=d(i,j)-d(i,i);
end
end
% Write d and dx side by side into one spreadsheet.
xlswrite('rr.xls',[d dx])
% Experiment script: leave-one-out NWD differences for a word list, then a
% gap-statistic clustering of those differences with GapKM.
% Append the sibling Gap folder to the MATLAB path (backslash separator, as
% written, is the Windows form).
path(path,'..\Gap')
% goTypical
% Create the shared query cache on first use; an existing cache is kept.
global QueryCache
if isempty(QueryCache)
QueryCache.Queries={};
QueryCache.Count=[];
end
% Reset the query / cache-hit counters for this run.
global nQueryCount nCacheCount
nQueryCount=0;
nCacheCount=0;
% words1={'red','orange','yellow','green','blue','vermilion','chartreuse'}
% words1={'red','orange','yellow','green','blue','indigo','violet'}
words1={'red','orange','yellow','green','blue'}
% words1={'red','orange','yellow','green','blue','indigo','texas'}
% NWD of the complete list (echoed).
dx=NWD(words1)
% d(i): NWD of the list with word i removed.
d=[];
for i=1:length(words1)
wordsX=words1;
wordsX(i)=[];
d(i)=NWD(wordsX);
end
warning off all
% Column vector of (full-list NWD) - (leave-one-out NWD), one entry per
% word - assuming NWD returns a scalar; TODO confirm.
d=dx-d';
% Try up to length(words1)-1 clusters on those differences.
[K Gap S idx] = GapKM(d,length(words1)-1);
% Experiment script: 2014 Nobel laureates versus three lists of other
% scientists' names. The timer started here is not read back within this
% script.
tic
% Create the shared query cache on first use; an existing cache is kept.
global QueryCache
if isempty(QueryCache)
QueryCache.Queries={};
QueryCache.Count=[];
end
% Reset the query / cache-hit counters for this run.
global nQueryCount nCacheCount
nQueryCount=0;
nCacheCount=0;
% '' inside a quoted string is MATLAB's escape for a single quote (O'Keefe).
nobels2014={'Isamu Akasaki', 'Hiroshi Amano', 'Shuji Nakamura','Eric Betzig', 'Stefan W. Hell', 'William E. Moerner','John O''Keefe', 'May-Britt Moser', 'Edvard I. Moser' }
% NOTE(review): some names below may be misspelled or misattributed as
% search terms (e.g. 'Nils Bohr', 'Richard Lister') - confirm intent.
physics={'Albert Einstein','Isaac Newton','Stephen Hawking','Nils Bohr','James Maxwell'}
chemistry={'Amedeo Avogadro','Louis Pasteur','Linus Pauling','Robert Boyle'}
medicine={'Elizabeth Blackwell','William Harvey','Carl Jung','Richard Lister'}
% Two labels only: 1 for the laureates, 2 for every name in the other three lists.
idxKey = [ones(1,length(nobels2014)) 2*ones(1,length(physics)) 2*ones(1,length(chemistry))...
2*ones(1,length(medicine))]
% Run both distance helpers on the concatenated list; the final idx is echoed.
[d]=GetDistances([nobels2014,physics,chemistry,medicine],idxKey);
idx=getDistancesPairs([nobels2014,physics,chemistry,medicine],idxKey)
% Experiment script: five groups of scientists' surnames, labelled 1..5, run
% through GetDistances and getDistancesPairs. The timer started here is not
% read back within this script.
tic
% Create the shared query cache on first use; an existing cache is kept.
global QueryCache
if isempty(QueryCache)
QueryCache.Queries={};
QueryCache.Count=[];
end
% Reset the query / cache-hit counters for this run.
global nQueryCount nCacheCount
nQueryCount=0;
nCacheCount=0;
words1={'Kolmogorov','Fermat','Hilbert', 'Godel', 'Riemann','Gauss'} % mathematicians
words2={'Einstein','Newton','Hawking','Bohr','Maxwell','Boltzmann',} %physicists
words3={'Freud','Pavlov','Skinner','Jung'} %psychologists
words4={'Turing','Lovelace','Knuth','Hopper'} % computer scientists
words5={'Darwin','Lamarck','Linnaeus','Mendel'} % biology
% Group label (1..5) for every name, in the order the lists are concatenated below.
idxKey = [ones(1,length(words1)) 2*ones(1,length(words2)) 3*ones(1,length(words3)) 4*ones(1,length(words4)) 5*ones(1,length(words5))]
[d]=GetDistances([words1,words2,words3,words4,words5],idxKey);
idx=getDistancesPairs([words1,words2,words3,words4,words5],idxKey);
-------------------------
GetDistances::(multiples) NW=5
2014 5 1 16 51 5.961
red,orange,yellow,green,blue,indigo,lion,tiger,bear,monkey,zebra,elephant,nQueryCount=49, nCacheCount=335
[6 0;0 6]
-------------------------
GetDistances::(pairs)
2014 5 1 16 51 57.306
red,orange,yellow,green,blue,indigo,lion,tiger,bear,monkey,zebra,elephant,kGap=2
nQueryCount=66, nCacheCount=198
nCorrect=11
-------------------------
GetDistances::(multiples) NW=5
2014 5 1 16 52 14.308
red,orange,yellow,green,blue,indigo,square,circle,rectangle,ellipse,triangle,rhombus,nQueryCount=30, nCacheCount=354
[6 0;0 6]
-------------------------
GetDistances::(pairs)
2014 5 1 16 52 53.707
red,orange,yellow,green,blue,indigo,square,circle,rectangle,ellipse,triangle,rhombus,kGap=2
nQueryCount=51, nCacheCount=213
nCorrect=12
-------------------------
GetDistances::(multiples) NW=5
2014 5 1 16 53 30.481
Barack Obama,Hillary Clinton,John Edwards,Joe Biden,Chris Dodd,Mike Gravel,John McCain,Mitt Romney,Mike Huckabee,Ron Paul,Fred Thompson,Alan Keyes,nQueryCount=48, nCacheCount=336
[6 0;0 6]
-------------------------
GetDistances::(pairs)
2014 5 1 16 54 30.869
Barack Obama,Hillary Clinton,John Edwards,Joe Biden,Chris Dodd,Mike Gravel,John McCain,Mitt Romney,Mike Huckabee,Ron Paul,Fred Thompson,Alan Keyes,kGap=2
nQueryCount=66, nCacheCount=198
nCorrect=7
-------------------------
GetDistances::(multiples) NW=5
2014 5 1 16 54 55.778
Labidochromis caeruleus,Sciaenochromis fryeri,Betta splendens,Carassius auratus,Melanochromis cyaneorhabdos,Ecsenius bicolor,Pictichromis paccagnellae,Amphiprion ocellaris ,Paracanthurus hepatus,Chromis viridis,nQueryCount=40, nCacheCount=240
[0 5;0 5]
-------------------------
GetDistances::(multiples) NW=5
2014 5 26 17 48 59.115
red,orange,yellow,green,blue,indigo,lion,tiger,bear,monkey,zebra,elephant,nQueryCount=49, nCacheCount=335
[6 0;0 6]
-------------------------
GetDistances::(pairs)
2014 5 26 17 49 47.104
red,orange,yellow,green,blue,indigo,lion,tiger,bear,monkey,zebra,elephant,kGap=2
nQueryCount=66, nCacheCount=198
nCorrect=11
-------------------------
GetDistances::(multiples) NW=5
2014 5 26 17 50 3.128
red,orange,yellow,green,blue,indigo,square,circle,rectangle,ellipse,triangle,rhombus,nQueryCount=30, nCacheCount=354
[6 0;0 6]
-------------------------
GetDistances::(pairs)
2014 5 26 17 50 38.946
red,orange,yellow,green,blue,indigo,square,circle,rectangle,ellipse,triangle,rhombus,kGap=2
nQueryCount=51, nCacheCount=213
nCorrect=12
-------------------------
GetDistances::(multiples) NW=5
2014 5 26 17 51 15.729
Barack Obama,Hillary Clinton,John Edwards,Joe Biden,Chris Dodd,Mike Gravel,John McCain,Mitt Romney,Mike Huckabee,Ron Paul,Fred Thompson,Alan Keyes,nQueryCount=48, nCacheCount=336
[6 0;0 6]
-------------------------
GetDistances::(pairs)
2014 5 26 17 52 16.695
Barack Obama,Hillary Clinton,John Edwards,Joe Biden,Chris Dodd,Mike Gravel,John McCain,Mitt Romney,Mike Huckabee,Ron Paul,Fred Thompson,Alan Keyes,kGap=2
nQueryCount=66, nCacheCount=198
nCorrect=7
-------------------------
GetDistances::(multiples) NW=5
2014 5 26 17 52 38.784
Labidochromis caeruleus,Sciaenochromis fryeri,Betta splendens,Carassius auratus,Melanochromis cyaneorhabdos,Ecsenius bicolor,Pictichromis paccagnellae,Amphiprion ocellaris ,Paracanthurus hepatus,Chromis viridis,nQueryCount=40, nCacheCount=240
[0 5;0 5]
-------------------------
GetDistances::(multiples) NW=5
2014 5 26 18 9 46.311
red,orange,yellow,green,blue,indigo,lion,tiger,bear,monkey,zebra,elephant,nQueryCount=150, nCacheCount=474
[0 6;6 0]
-------------------------
GetDistances::(pairs)
2014 5 26 18 9 48.012
red,orange,yellow,green,blue,indigo,lion,tiger,bear,monkey,zebra,elephant,kGap=4
nQueryCount=0, nCacheCount=330
nCorrect=7
-------------------------
GetDistances::(multiples) NW=5
2014 5 26 18 10 50.192
red,orange,yellow,green,blue,indigo,square,circle,rectangle,ellipse,triangle,rhombus,nQueryCount=111, nCacheCount=513
[0 6;4 2]
-------------------------
GetDistances::(pairs)
2014 5 26 18 10 51.87
red,orange,yellow,green,blue,indigo,square,circle,rectangle,ellipse,triangle,rhombus,kGap=3
nQueryCount=0, nCacheCount=330
nCorrect=6
-------------------------
GetDistances::(multiples) NW=5
2014 5 26 18 12 44.564
Barack Obama,Hillary Clinton,John Edwards,Joe Biden,Chris Dodd,Mike Gravel,John McCain,Mitt Romney,Mike Huckabee,Ron Paul,Fred Thompson,Alan Keyes,nQueryCount=150, nCacheCount=474
[1 5;6 0]
-------------------------
GetDistances::(pairs)
2014 5 26 18 12 46.22
Barack Obama,Hillary Clinton,John Edwards,Joe Biden,Chris Dodd,Mike Gravel,John McCain,Mitt Romney,Mike Huckabee,Ron Paul,Fred Thompson,Alan Keyes,kGap=2
nQueryCount=0, nCacheCount=330
nCorrect=8
-------------------------
GetDistances::(multiples) NW=5
2014 5 26 18 13 46.902
Labidochromis caeruleus,Sciaenochromis fryeri,Betta splendens,Carassius auratus,Melanochromis cyaneorhabdos,Ecsenius bicolor,Pictichromis paccagnellae,Amphiprion ocellaris ,Paracanthurus hepatus,Chromis viridis,nQueryCount=100, nCacheCount=340
[0 5;0 5]
-------------------------
GetDistances::(pairs)
2014 5 26 18 13 48.428
Labidochromis caeruleus,Sciaenochromis fryeri,Betta splendens,Carassius auratus,Melanochromis cyaneorhabdos,Ecsenius bicolor,Pictichromis paccagnellae,Amphiprion ocellaris ,Paracanthurus hepatus,Chromis viridis,kGap=3
nQueryCount=0, nCacheCount=225
nCorrect=7
-------------------------
GetDistances::(multiples) NW=5
2014 5 27 11 29 56.627
red,orange,yellow,green,blue,indigo,lion,tiger,bear,monkey,zebra,elephant,nQueryCount=0, nCacheCount=672
[6 0;3 3]
-------------------------
GetDistances::(pairs)
2014 5 27 11 29 58.3
red,orange,yellow,green,blue,indigo,lion,tiger,bear,monkey,zebra,elephant,kGap=1
nQueryCount=0, nCacheCount=396
nCorrect=10
-------------------------
GetDistances::(multiples) NW=5
2014 5 27 11 29 58.339
red,orange,yellow,green,blue,indigo,square,circle,rectangle,ellipse,triangle,rhombus,nQueryCount=0, nCacheCount=672
[6 0;2 4]
-------------------------
GetDistances::(pairs)
2014 5 27 11 30 0.027
red,orange,yellow,green,blue,indigo,square,circle,rectangle,ellipse,triangle,rhombus,kGap=2