Commit 14e63f51 authored by Andrew Cohen
Browse files

Merge remote-tracking branch 'origin/pubmed' into wikipedia_AND

Conflicts:
	results.txt
parents bef2f507 56265600
function f = GetCount(words)
% GETCOUNT  Search-hit count for a set of words on the active server.
%
%   f = GetCount(words) joins the strings of the cell array WORDS with
%   '+', turns every space into '+', and returns the hit count reported
%   by the server selected through the global strServer:
%       'pubmed'  -> doURLQueryPubmed
%       'amazon'  -> doURLQueryAmazon
%       otherwise -> doURLQueryWikipedia (strServer defaults to 'wikipedia')
%
%   Results are memoised in the global struct QueryCache (fields Queries
%   and Count). The globals nQueryCount and nCacheCount are incremented
%   on every server query and cache hit respectively.
%
%   This version resolves the merge conflict that had been committed
%   into this function (conflict markers removed, HEAD side kept).
global nQueryCount nCacheCount strServer
global QueryCache

if isempty(strServer)
    strServer='wikipedia';   % terminated with ';' so nothing is echoed
end
% Lazily initialise the shared state: indexing QueryCache.Queries on an
% unset global ([]) errors, and []+1 stays [] so counters would never move.
if isempty(QueryCache)
    QueryCache=struct('Queries',{{}},'Count',[]);
end
if isempty(nQueryCount), nQueryCount=0; end
if isempty(nCacheCount), nCacheCount=0; end

% Build the query term: word1+word2+..., with spaces also mapped to '+'.
term=[words{1}];
for i=2:length(words)
    term=[term '+' words{i}];
end
term(term==' ')='+';

% check cache
cc=strcmp(QueryCache.Queries,term);
if ~isempty(cc) && any(cc)
    f=QueryCache.Count(find(cc,1));   % first match keeps f scalar
    nCacheCount=nCacheCount+1;
    return
end

if strcmp(strServer,'pubmed')
    f=doURLQueryPubmed(term);
elseif strcmp(strServer,'amazon')
    f=doURLQueryAmazon(term);
else
    f=doURLQueryWikipedia(term);
end
nQueryCount=nQueryCount+1;
QueryCache.Queries=[QueryCache.Queries {term}];
QueryCache.Count=[QueryCache.Count f];
end

function f=doURLQueryWikipedia(term)
% DOURLQUERYWIKIPEDIA  Hit count scraped from the English Wikipedia search.
%   Reads the number that follows 'of <b>' inside the 'results-info' div
%   of the full-text search page for TERM. Returns 0 when that markup is
%   not present.
%   Example query:
%   http://en.wikipedia.org/w/index.php?title=Special%3ASearch&profile=default&search=blue+OR+elephant+&fulltext=Search
URL=['http://en.wikipedia.org/w/index.php?title=Special%3ASearch&profile=default&search=' term '+&fulltext=Search'];
str = urlread(URL);
try
    idx=strfind(str,'div class="results-info"');
    str2=str(idx(1):end);
    idx2=strfind(str2,'of <b>');
    str3=str2(idx2(1)+6:end);     % first match only: colon needs a scalar
    idx3=strfind(str3,'</b>');
    nx=str3(1:idx3(1)-1);
    f=str2double(nx);
catch
    f=0;
end
end

function f=doURLQueryPubmed(term)
% DOURLQUERYPUBMED  Hit count scraped from the NCBI gquery page.
%   Reads the text between '<h2>About ' and 'search results for' on the
%   result page for TERM. Returns 0 when either marker is missing.
%   Example query:
%   http://www.ncbi.nlm.nih.gov/gquery/?term=WHSC1%2CNSD1%2CASH1L%2CSETD2%2C
URL = [ 'http://www.ncbi.nlm.nih.gov/gquery/?term=' term];
str = urlread(URL);
try
    key = '<h2>About ';
    idx1=strfind(str,key);
    % Explicit (1): a missing marker now raises inside the try and yields
    % 0, instead of silently producing an empty string and NaN.
    idx1=idx1(1)+length(key);
    idx2=strfind(str(idx1:end), 'search results for');
    nx=str(idx1:idx1+idx2(1)-2);
    f=str2double(nx);
catch
    f=0;
end
end

function f=doURLQueryAmazon(term)
% DOURLQUERYAMAZON  Hit count scraped from the Amazon search page.
%   Makes up to five attempts. Each attempt fetches the page and reads the
%   space-delimited token that ends two characters before
%   'results for <span class="'. A failed parse sets f=0 and waits one
%   second before the next attempt.
URL=['http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=' term '&rh=i%3Aaps%2Ck%3A' term];
for i=1:5
    str = urlread(URL);
    try
        idx=strfind(str,'results for <span class="');
        idx=idx(1)-2;
        % walk back to the space that precedes the count
        idxStart=idx;
        while(str(idxStart)~=' ')
            idxStart=idxStart-1;
        end
        nx=str(idxStart:idx);
        f=str2double(nx);
        break;
    catch
        f=0;
        pause(1);
    end
end
end
function f = GetCount(words)
% GETCOUNT  NCBI gquery hit count for a set of words, with caching.
%
%   f = GetCount(words) joins the strings of the cell array WORDS with
%   '+', turns every space into '+', queries
%   http://www.ncbi.nlm.nih.gov/gquery/ with that term and returns the
%   number found between '<h2>About ' and 'search results for'.
%   Returns 0 when those markers are not found.
%
%   Results are memoised in the global struct QueryCache (fields Queries
%   and Count). The globals nQueryCount and nCacheCount are incremented
%   on every server query and cache hit respectively.
global nQueryCount nCacheCount
global QueryCache

% Lazily initialise the shared state: indexing QueryCache.Queries on an
% unset global ([]) errors, and []+1 stays [] so counters would never move.
if isempty(QueryCache)
    QueryCache=struct('Queries',{{}},'Count',[]);
end
if isempty(nQueryCount), nQueryCount=0; end
if isempty(nCacheCount), nCacheCount=0; end

% Build the query term: word1+word2+..., with spaces also mapped to '+'.
term=[words{1}];
for i=2:length(words)
    term=[term '+' words{i}];
end
term(term==' ')='+';

% check cache
cc=strcmp(QueryCache.Queries,term);
if ~isempty(cc) && any(cc)
    f=QueryCache.Count(find(cc,1));   % first match keeps f scalar
    nCacheCount=nCacheCount+1;
    return
end

% Example queries for the servers that have been tried:
% http://en.wikipedia.org/w/index.php?title=Special%3ASearch&profile=default&search=blue+OR+elephant+&fulltext=Search
% http://www.ncbi.nlm.nih.gov/gquery/?term=WHSC1%2CNSD1%2CASH1L%2CSETD2%2C
URL = [ 'http://www.ncbi.nlm.nih.gov/gquery/?term=' term];
str = urlread(URL);
try
    key = '<h2>About ';
    idx1=strfind(str,key);
    % Explicit (1): a missing marker now raises inside the try and yields
    % 0, instead of silently producing an empty string and a cached NaN.
    idx1=idx1(1)+length(key);
    idx2=strfind(str(idx1:end), 'search results for');
    nx=str(idx1:idx1+idx2(1)-2);
    f=str2double(nx);
catch
    f=0;
end
nQueryCount=nQueryCount+1;
QueryCache.Queries=[QueryCache.Queries {term}];
QueryCache.Count=[QueryCache.Count f];
function f = GetCount(words)
% GETCOUNT  Search-hit count for a set of words on the active server.
%
%   f = GetCount(words) joins the strings of the cell array WORDS with
%   '+', turns every space into '+', and returns the hit count reported
%   by the server selected through the global strServer:
%       'pubmed'  -> doURLQueryPubmed
%       'amazon'  -> doURLQueryAmazon
%       otherwise -> doURLQueryWikipedia (strServer defaults to 'wikipedia')
%
%   Results are memoised in the global struct QueryCache (fields Queries
%   and Count). The globals nQueryCount and nCacheCount are incremented
%   on every server query and cache hit respectively.
global nQueryCount nCacheCount strServer
global QueryCache

if isempty(strServer)
    strServer='wikipedia';   % terminated with ';' so nothing is echoed
end
% Lazily initialise the shared state: indexing QueryCache.Queries on an
% unset global ([]) errors, and []+1 stays [] so counters would never move.
if isempty(QueryCache)
    QueryCache=struct('Queries',{{}},'Count',[]);
end
if isempty(nQueryCount), nQueryCount=0; end
if isempty(nCacheCount), nCacheCount=0; end

% Build the query term: word1+word2+..., with spaces also mapped to '+'.
term=[words{1}];
for i=2:length(words)
    term=[term '+' words{i}];
end
term(term==' ')='+';

% check cache
cc=strcmp(QueryCache.Queries,term);
if ~isempty(cc) && any(cc)
    f=QueryCache.Count(find(cc,1));   % first match keeps f scalar
    nCacheCount=nCacheCount+1;
    return
end

if strcmp(strServer,'pubmed')
    f=doURLQueryPubmed(term);
elseif strcmp(strServer,'amazon')
    f=doURLQueryAmazon(term);
else
    f=doURLQueryWikipedia(term);
end
nQueryCount=nQueryCount+1;
QueryCache.Queries=[QueryCache.Queries {term}];
QueryCache.Count=[QueryCache.Count f];
end
function f=doURLQueryWikipedia(term)
% DOURLQUERYWIKIPEDIA  Hit count scraped from the English Wikipedia search.
%   Reads the number that follows 'of <b>' inside the 'results-info' div
%   of the full-text search page for TERM and converts it with
%   str2double. Returns 0 when that markup is not present.
URL=['http://en.wikipedia.org/w/index.php?title=Special%3ASearch&profile=default&search=' term '+&fulltext=Search'];
str = urlread(URL);
try
    idx=strfind(str,'div class="results-info"');
    str2=str(idx(1):end);
    idx2=strfind(str2,'of <b>');
    % strfind returns every match; take the first explicitly so the
    % colon expression below always receives a scalar operand.
    str3=str2(idx2(1)+6:end);
    idx3=strfind(str3,'</b>');
    nx=str3(1:idx3(1)-1);
    f=str2double(nx);
catch
    % any missing marker raises an indexing error above
    f=0;
end
end
function f=doURLQueryPubmed(term)
% DOURLQUERYPUBMED  Hit count scraped from the NCBI gquery page.
%   Reads the text between '<h2>About ' and 'search results for' on the
%   result page for TERM and converts it with str2double.
%   Returns 0 when either marker is missing.
URL = [ 'http://www.ncbi.nlm.nih.gov/gquery/?term=' term];
str = urlread(URL);
try
    key = '<h2>About ';
    idx1=strfind(str,key);
    % Explicit (1): with an empty strfind result the old code raised no
    % error, so the catch never ran and str2double('') returned NaN.
    idx1=idx1(1)+length(key);
    idx2=strfind(str(idx1:end), 'search results for');
    nx=str(idx1:idx1+idx2(1)-2);
    f=str2double(nx);
catch
    f=0;
end
end
function f=doURLQueryAmazon(term)
% DOURLQUERYAMAZON  Hit count scraped from the Amazon search page.
%   Makes up to five attempts. Each attempt downloads the search page for
%   TERM and converts the space-delimited token that ends two characters
%   before 'results for <span class="' with str2double. When an attempt
%   cannot be parsed, f is set to 0 and the function waits one second
%   before trying again.
searchPrefix='http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=';
URL=[searchPrefix term '&rh=i%3Aaps%2Ck%3A' term];

maxAttempts=5;
attempt=0;
parsed=false;
while ~parsed && attempt<maxAttempts
    attempt=attempt+1;
    page=urlread(URL);
    try
        hits=strfind(page,'results for <span class="');
        last=hits(1)-2;
        % step left until the blank in front of the count is reached
        first=last;
        while page(first)~=' '
            first=first-1;
        end
        f=str2double(page(first:last));
        parsed=true;
    catch
        f=0;
        pause(1);
    end
end
end
function f = GetCount(words)
% GETCOUNT  Amazon search hit count for a set of words, with caching.
%
%   f = GetCount(words) joins the strings of the cell array WORDS with
%   '+', turns every space into '+', and scrapes the Amazon search page
%   for that term. Up to five attempts are made. An attempt that cannot
%   be parsed sets f=0 and waits one second before the next one.
%
%   Results are memoised in the global struct QueryCache (fields Queries
%   and Count). The globals nQueryCount and nCacheCount are incremented
%   on every server query and cache hit respectively.
global nQueryCount nCacheCount
global QueryCache

% Lazily initialise the shared state: indexing QueryCache.Queries on an
% unset global ([]) errors, and []+1 stays [] so counters would never move.
if isempty(QueryCache)
    QueryCache=struct('Queries',{{}},'Count',[]);
end
if isempty(nQueryCount), nQueryCount=0; end
if isempty(nCacheCount), nCacheCount=0; end

% Build the query term: word1+word2+..., with spaces also mapped to '+'.
term=[words{1}];
for i=2:length(words)
    term=[term '+' words{i}];
end
term(term==' ')='+';

% check cache
cc=strcmp(QueryCache.Queries,term);
if ~isempty(cc) && any(cc)
    f=QueryCache.Count(find(cc,1));   % first match keeps f scalar
    nCacheCount=nCacheCount+1;
    return
end

% Example queries for the other servers that have been tried:
% http://en.wikipedia.org/w/index.php?title=Special%3ASearch&profile=default&search=blue+OR+elephant+&fulltext=Search
% http://www.ncbi.nlm.nih.gov/gquery/?term=WHSC1%2CNSD1%2CASH1L%2CSETD2%2C
% http://www.ncbi.nlm.nih.gov/protein/?term=...
URL=['http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=' term '&rh=i%3Aaps%2Ck%3A' term];
for i=1:5
    str = urlread(URL);
    try
        %amazon
        idx=strfind(str,'results for <span class="');
        idx=idx(1)-2;
        % walk back to the space that precedes the count
        idxStart=idx;
        while(str(idxStart)~=' ')
            idxStart=idxStart-1;
        end
        nx=str(idxStart:idx);
        f=str2double(nx);
        break;
    catch
        f=0;
        pause(1);
    end
end
nQueryCount=nQueryCount+1;
QueryCache.Queries=[QueryCache.Queries {term}];
QueryCache.Count=[QueryCache.Count f];
function f = GetCount(words)
% GETCOUNT  Search-hit count for a set of words on the active server.
%
%   f = GetCount(words) joins the strings of the cell array WORDS with
%   '+', turns every space into '+', and returns the hit count reported
%   by the server selected through the global strServer:
%       'pubmed'  -> doURLQueryPubmed
%       'amazon'  -> doURLQueryAmazon
%       otherwise -> doURLQueryWikipedia (strServer defaults to 'wikipedia')
%
%   Results are memoised in the global struct QueryCache (fields Queries
%   and Count). The globals nQueryCount and nCacheCount are incremented
%   on every server query and cache hit respectively.
%
%   This version resolves the merge conflict that had been committed
%   into this function (conflict markers removed, HEAD side kept).
global nQueryCount nCacheCount strServer
global QueryCache

if isempty(strServer)
    strServer='wikipedia';   % terminated with ';' so nothing is echoed
end
% Lazily initialise the shared state: indexing QueryCache.Queries on an
% unset global ([]) errors, and []+1 stays [] so counters would never move.
if isempty(QueryCache)
    QueryCache=struct('Queries',{{}},'Count',[]);
end
if isempty(nQueryCount), nQueryCount=0; end
if isempty(nCacheCount), nCacheCount=0; end

% Build the query term: word1+word2+..., with spaces also mapped to '+'.
term=[words{1}];
for i=2:length(words)
    term=[term '+' words{i}];
end
term(term==' ')='+';

% check cache
cc=strcmp(QueryCache.Queries,term);
if ~isempty(cc) && any(cc)
    f=QueryCache.Count(find(cc,1));   % first match keeps f scalar
    nCacheCount=nCacheCount+1;
    return
end

if strcmp(strServer,'pubmed')
    f=doURLQueryPubmed(term);
elseif strcmp(strServer,'amazon')
    f=doURLQueryAmazon(term);
else
    f=doURLQueryWikipedia(term);
end
nQueryCount=nQueryCount+1;
QueryCache.Queries=[QueryCache.Queries {term}];
QueryCache.Count=[QueryCache.Count f];
end

function f=doURLQueryWikipedia(term)
% DOURLQUERYWIKIPEDIA  Hit count scraped from the English Wikipedia search.
%   Reads the number that follows 'of <b>' inside the 'results-info' div
%   of the full-text search page for TERM. Returns 0 when that markup is
%   not present.
%   Example query:
%   http://en.wikipedia.org/w/index.php?title=Special%3ASearch&profile=default&search=blue+OR+elephant+&fulltext=Search
URL=['http://en.wikipedia.org/w/index.php?title=Special%3ASearch&profile=default&search=' term '+&fulltext=Search'];
str = urlread(URL);
try
    idx=strfind(str,'div class="results-info"');
    str2=str(idx(1):end);
    idx2=strfind(str2,'of <b>');
    str3=str2(idx2(1)+6:end);     % first match only: colon needs a scalar
    idx3=strfind(str3,'</b>');
    nx=str3(1:idx3(1)-1);
    f=str2double(nx);
catch
    f=0;
end
end

function f=doURLQueryPubmed(term)
% DOURLQUERYPUBMED  Hit count scraped from the NCBI gquery page.
%   Reads the text between '<h2>About ' and 'search results for' on the
%   result page for TERM. Returns 0 when either marker is missing.
%   Example query:
%   http://www.ncbi.nlm.nih.gov/gquery/?term=WHSC1%2CNSD1%2CASH1L%2CSETD2%2C
URL = [ 'http://www.ncbi.nlm.nih.gov/gquery/?term=' term];
str = urlread(URL);
try
    key = '<h2>About ';
    idx1=strfind(str,key);
    % Explicit (1): a missing marker now raises inside the try and yields
    % 0, instead of silently producing an empty string and NaN.
    idx1=idx1(1)+length(key);
    idx2=strfind(str(idx1:end), 'search results for');
    nx=str(idx1:idx1+idx2(1)-2);
    f=str2double(nx);
catch
    f=0;
end
end

function f=doURLQueryAmazon(term)
% DOURLQUERYAMAZON  Hit count scraped from the Amazon search page.
%   Makes up to five attempts. Each attempt fetches the page and reads the
%   space-delimited token that ends two characters before
%   'results for <span class="'. A failed parse sets f=0 and waits one
%   second before the next attempt.
URL=['http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=' term '&rh=i%3Aaps%2Ck%3A' term];
for i=1:5
    str = urlread(URL);
    try
        idx=strfind(str,'results for <span class="');
        idx=idx(1)-2;
        % walk back to the space that precedes the count
        idxStart=idx;
        while(str(idxStart)~=' ')
            idxStart=idxStart-1;
        end
        nx=str(idxStart:idx);
        f=str2double(nx);
        break;
    catch
        f=0;
        pause(1);
    end
end
end
% Driver script.
% Defines one cell array of rs-IDs per disease, evaluates NWD on every
% ordered pair of lists (each pair concatenated into one list), runs
% GetDistances / getDistancesPairs on two hand-picked lists, and writes
% the NWD matrix next to its diagonal-subtracted version to rr.xls.
% Statements left without a trailing ';' echo their value on purpose.

% http://www.nature.com/nrd/journal/v11/n5/fig_tab/nrd3674_F2.html
% words1={'ELP3','KAT2B','MYST4','KAT5','HAT1'}; % PMTs
% words2={'CREBBP','EP300','MLL','PHIP','BRWD3'}; %BRDs

% schizophrenia
% http://www.nature.com/nature/journal/v511/n7510/full/nature13595.html
% supp table 2, in ranked order
shizophrenia = { ...
    'rs1702294', 'rs11191419', 'rs2007044', 'rs4129585', 'rs35518360'};

% leukemia
% http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3250464/
% table 1
leukemia = { ...
    'rs17483466', 'rs13397985', 'rs757978', 'rs2456449', 'rs735665', ...
    'rs783540', 'rs305061', 'rs391525', 'rs1036935', 'rs11083846'};

% alzheimers
% http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3365264/
% table 1
% http://www.nature.com/tp/journal/v2/n5/fig_tab/tp201245t1.html#figure-title
alzheimers = { ...
    'rs4420638', 'rs7561528', 'rs17817600', 'rs3748140', ...
    'rs12808148', 'rs6856768', 'rs11738335', 'rs1357692'};

% obesity
% http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2858696/
% table 1
obesity = { ...
    'rs10926984', 'rs12145833', 'rs2783963', ...
    'rs11127485', 'rs17150703', 'rs13278851'};

%
% words1={'SwObihiro92','Leningrad91','Stockholm90','Yamagata89'}; % II
% words2={'SwNebraska92','SwStHyacinthe91','SwQuebec90','SwIowa88'};

% % neuroblastoma
% % http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3716226/
% % table 3
% neuroblastoma={'rs6939340','rs4712653','rs6435862','rs3768716','rs7585356'};
% neuroblastoma 2
% http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2742373/table/T1/
neuroblastoma = { ...
    'rs6939340', 'rs4712653', 'rs9295536', 'rs3790171', 'rs7272481'};

% parkinsons
% http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3305333/
% table 2, rs entries, by pvalue
% now trying http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3852568/table/tbl03/
% p<.1% in 2/3
% parkinsons={'rs356219','rs10847864','rs1491942','rs947211','rs2390669'}
parkinsons = {'rs356219', 'rs10847864', 'rs2942168', 'rs11724635'}

% words1={'Cap-G','aub','vas','neb'}; % mitotic chromosome condensation
% words2={'br','velo','E2f','Dp','ctp'}; % cell death

% One row per disease; the row order fixes the row/column order of d.
words = {alzheimers; parkinsons; shizophrenia; leukemia; obesity; neuroblastoma}

% words1={'cytoskeleton' 'Actin','Arp2/3','Coronin','Dystrophin','FtsZ','Keratin'} % Cytoskeletal proteins
% words2={'adhesion' 'Cadherin','Ependymin','Integrin','NCAM','Selectin'} % Cell adhesion

% Shared state used by GetCount: create the cache only when it does not
% exist yet, but always restart the two counters.
global QueryCache nQueryCount nCacheCount
if isempty(QueryCache)
    QueryCache = struct('Queries', {{}}, 'Count', []);
end
nQueryCount = 0;
nCacheCount = 0;

% words1={ 'DRD2', 'GRM3', 'GRIN2A', 'SRR', 'CACNA1I', 'RIMS1', 'KCTD13'}
% words2={ 'LRRK2', 'PINK1', 'ATP13A2', 'MAPT', 'SNCA', 'HLA-DRB5', 'BST1','GAK'...
% 'ACMSD', 'STK39', 'MCCC1/LAMP3', 'SYT11', 'CCDC62/HIP1R'}

% d(r,c) = NWD of list r concatenated with list c, filled row by row.
nLists = length(words);
d = zeros(nLists);
for r = 1:nLists
    for c = 1:nLists
        d(r,c) = NWD([words{r}, words{c}]);
    end
end

words1 = {'rs17483466', 'rs13397985', 'rs757978', 'rs872071', 'rs2456449', 'rs735665'}
words2 = {'rs4420638', 'rs7561528', 'rs17817600', 'rs3748140', ...
    'rs12808148', 'rs6856768', 'rs11738335', 'rs1357692'}
[d1, d2] = GetDistances(words1, words2);
idx = getDistancesPairs(words1, words2);

% dx(r,c) = d(r,c) - d(r,r): every row relative to its own diagonal entry.
dx = bsxfun(@minus, d, diag(d));

xlswrite('rr.xls', [d dx])
-------------------------
GetDistances::(multiples) NW=5
2014 5 1 16 51 5.961
red,orange,yellow,green,blue,indigo,lion,tiger,bear,monkey,zebra,elephant,nQueryCount=49, nCacheCount=335
[6 0;0 6]
-------------------------
GetDistances::(pairs)
2014 5 1 16 51 57.306
red,orange,yellow,green,blue,indigo,lion,tiger,bear,monkey,zebra,elephant,kGap=2
nQueryCount=66, nCacheCount=198
nCorrect=11
-------------------------
GetDistances::(multiples) NW=5
2014 5 1 16 52 14.308
red,orange,yellow,green,blue,indigo,square,circle,rectangle,ellipse,triangle,rhombus,nQueryCount=30, nCacheCount=354
[6 0;0 6]
-------------------------
GetDistances::(pairs)
2014 5 1 16 52 53.707
red,orange,yellow,green,blue,indigo,square,circle,rectangle,ellipse,triangle,rhombus,kGap=2
nQueryCount=51, nCacheCount=213
nCorrect=12