<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"
	xmlns:content="http://purl.org/rss/1.0/modules/content/"
	xmlns:wfw="http://wellformedweb.org/CommentAPI/"
	xmlns:dc="http://purl.org/dc/elements/1.1/"
	xmlns:atom="http://www.w3.org/2005/Atom"
	xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
	xmlns:slash="http://purl.org/rss/1.0/modules/slash/"
	xmlns:georss="http://www.georss.org/georss" xmlns:geo="http://www.w3.org/2003/01/geo/wgs84_pos#" xmlns:media="http://search.yahoo.com/mrss/"
	>

<channel>
	<title>So much to do, so little time</title>
	<atom:link href="http://rguha.wordpress.com/feed/" rel="self" type="application/rss+xml" />
	<link>http://rguha.wordpress.com</link>
	<description>Trying to squeeze sense out of chemical data</description>
	<lastBuildDate>Sun, 03 Apr 2011 16:33:56 +0000</lastBuildDate>
	<language>en</language>
	<sy:updatePeriod>hourly</sy:updatePeriod>
	<sy:updateFrequency>1</sy:updateFrequency>
	<generator>http://wordpress.com/</generator>
<cloud domain='rguha.wordpress.com' port='80' path='/?rsscloud=notify' registerProcedure='' protocol='http-post' />
<image>
		<url>http://s2.wp.com/i/buttonw-com.png</url>
		<title>So much to do, so little time</title>
		<link>http://rguha.wordpress.com</link>
	</image>
	<atom:link rel="search" type="application/opensearchdescription+xml" href="http://rguha.wordpress.com/osd.xml" title="So much to do, so little time" />
	<atom:link rel='hub' href='http://rguha.wordpress.com/?pushpress=hub'/>
		<item>
		<title>Blog Moved to http://blog.rguha.net</title>
		<link>http://rguha.wordpress.com/2009/02/02/blog-moved-to-httpblogrguhanet/</link>
		<comments>http://rguha.wordpress.com/2009/02/02/blog-moved-to-httpblogrguhanet/#comments</comments>
		<pubDate>Mon, 02 Feb 2009 06:25:10 +0000</pubDate>
		<dc:creator>Rajarshi Guha</dc:creator>
				<category><![CDATA[Uncategorized]]></category>

		<guid isPermaLink="false">http://rguha.wordpress.com/?p=293</guid>
		<description><![CDATA[Over the last few days I&#8217;ve started moving my web material to a new host. As a result I&#8217;ve decided to consolidate my blog there as well. So, I won&#8217;t be posting here anymore. The new blog is located at http://blog.rguha.net and carries over all the posts, tags and comments.<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=rguha.wordpress.com&amp;blog=4664940&amp;post=293&amp;subd=rguha&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>Over the last few days I&#8217;ve started moving my web material to a new host. As a result I&#8217;ve decided to consolidate my blog there as well. So, I won&#8217;t be posting here anymore. The new blog is located at <a href="http://blog.rguha.net">http://blog.rguha.net</a> and carries over all the posts, tags and comments.</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/rguha.wordpress.com/293/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/rguha.wordpress.com/293/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/rguha.wordpress.com/293/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/rguha.wordpress.com/293/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/rguha.wordpress.com/293/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/rguha.wordpress.com/293/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/rguha.wordpress.com/293/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/rguha.wordpress.com/293/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/rguha.wordpress.com/293/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/rguha.wordpress.com/293/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/rguha.wordpress.com/293/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/rguha.wordpress.com/293/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/rguha.wordpress.com/293/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/rguha.wordpress.com/293/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=rguha.wordpress.com&amp;blog=4664940&amp;post=293&amp;subd=rguha&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://rguha.wordpress.com/2009/02/02/blog-moved-to-httpblogrguhanet/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://0.gravatar.com/avatar/867224a54ab2831c16fe2e97186e6a1a?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">rguha</media:title>
		</media:content>
	</item>
		<item>
		<title>Papers About Systems You Can&#8217;t Use or Buy</title>
		<link>http://rguha.wordpress.com/2009/02/01/papers-about-systems-you-cant-use-or-buy/</link>
		<comments>http://rguha.wordpress.com/2009/02/01/papers-about-systems-you-cant-use-or-buy/#comments</comments>
		<pubDate>Sun, 01 Feb 2009 03:02:39 +0000</pubDate>
		<dc:creator>Rajarshi Guha</dc:creator>
				<category><![CDATA[Literature]]></category>

		<guid isPermaLink="false">http://rguha.wordpress.com/?p=291</guid>
		<description><![CDATA[Browsing the latest articles in JCIM, I came across one by Sander et al that discussed the design of a drug discovery informatics system employed at Actelion. The main claim to fame of the work appears to be the fact that it was built from scratch and so is vendor independent. While somewhat interesting, one [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=rguha.wordpress.com&amp;blog=4664940&amp;post=291&amp;subd=rguha&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p style="text-align:justify;">Browsing the latest articles in <a href="http://pubs.acs.org/journal/jcisd8">JCIM</a>, I came across one by <a href="http://dx.doi.org/10.1021/ci800305f">Sander et al</a> that discussed the design of a drug discovery informatics system employed at <a href="http://www.actelion.com/">Actelion</a>. The main claim to fame of the work appears to be the fact that it was built from scratch and so is vendor independent.</p>
<p style="text-align:justify;">While somewhat interesting, one question jumped out at me: <em>what is the value of this paper? </em>The system is specific to this one company so it&#8217;s not like I can access the code or the workflows. I can&#8217;t even buy this software. In this case, they do provide <a href="http://www.cheminformatics.ch/">public access</a> to some of their tools, so it&#8217;s not a totally &#8220;opaque&#8221; paper. But there are other <a href="http://dx.doi.org/10.1021/ci700267w">examples</a> where one cannot really even try out the tools described in the paper.</p>
<p style="text-align:justify;">While I see the value of the paper from the authors&#8217; point of view (spreading the word, publications being &#8220;currency&#8221;, etc.), such papers have always felt a little pointless to me, as a reader. What can I do after reading this paper? Is there anything I can follow up on?</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/rguha.wordpress.com/291/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/rguha.wordpress.com/291/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/rguha.wordpress.com/291/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/rguha.wordpress.com/291/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/rguha.wordpress.com/291/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/rguha.wordpress.com/291/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/rguha.wordpress.com/291/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/rguha.wordpress.com/291/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/rguha.wordpress.com/291/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/rguha.wordpress.com/291/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/rguha.wordpress.com/291/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/rguha.wordpress.com/291/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/rguha.wordpress.com/291/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/rguha.wordpress.com/291/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=rguha.wordpress.com&amp;blog=4664940&amp;post=291&amp;subd=rguha&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://rguha.wordpress.com/2009/02/01/papers-about-systems-you-cant-use-or-buy/feed/</wfw:commentRss>
		<slash:comments>13</slash:comments>
	
		<media:content url="http://0.gravatar.com/avatar/867224a54ab2831c16fe2e97186e6a1a?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">rguha</media:title>
		</media:content>
	</item>
		<item>
		<title>Getting the GO into a Graph Data Structure</title>
		<link>http://rguha.wordpress.com/2009/01/31/getting-the-go-into-a-graph-data-structure/</link>
		<comments>http://rguha.wordpress.com/2009/01/31/getting-the-go-into-a-graph-data-structure/#comments</comments>
		<pubDate>Sat, 31 Jan 2009 01:34:08 +0000</pubDate>
		<dc:creator>Rajarshi Guha</dc:creator>
				<category><![CDATA[software]]></category>
		<category><![CDATA[go]]></category>
		<category><![CDATA[network]]></category>
		<category><![CDATA[python]]></category>

		<guid isPermaLink="false">http://rguha.wordpress.com/?p=287</guid>
		<description><![CDATA[Today while working on a project I needed to get access to the Gene Ontology hierarchy. While there are a number of GO browsers such as Amigo, I needed access to the raw data to generate a graph that I could then slice and dice. A few minutes with Python led to a simple solution. The [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=rguha.wordpress.com&amp;blog=4664940&amp;post=287&amp;subd=rguha&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p style="text-align:justify;">Today while working on a project I needed to get access to the <a href="http://www.geneontology.org/">Gene Ontology</a> hierarchy. While there are a number of GO browsers such as <a href="http://amigo.geneontology.org/cgi-bin/amigo/go.cgi">Amigo,</a> I needed access to the raw data to generate a graph that I could then slice and dice. A few minutes with Python led to a simple <a href="http://cheminfo.informatics.indiana.edu/~rguha/code/python/goproc.py">solution</a>.</p>
<p style="text-align:justify;">The program parses the <a href="http://www.obofoundry.org/">OBO</a> 1.2 formatted GO data file (either by directly downloading it or from a local file) and outputs a flat dictionary listing the term ID&#8217;s, names, namespace etc and a network representation of the GO hierarchy in <a href="http://lgl.sourceforge.net/#FileFormat">ncol</a> format. It uses a simple  (and relatively non-robust) class to represent the data as an undirected graph (not really correct), though it&#8217;d be easy to use something like <a href="http://cneurocvs.rmki.kfki.hu/igraph/">igraph</a> to start doing some real network analysis. It&#8217;s certainly not a comprehensive solution, but I thought I&#8217;d put it out there.</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/rguha.wordpress.com/287/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/rguha.wordpress.com/287/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/rguha.wordpress.com/287/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/rguha.wordpress.com/287/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/rguha.wordpress.com/287/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/rguha.wordpress.com/287/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/rguha.wordpress.com/287/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/rguha.wordpress.com/287/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/rguha.wordpress.com/287/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/rguha.wordpress.com/287/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/rguha.wordpress.com/287/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/rguha.wordpress.com/287/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/rguha.wordpress.com/287/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/rguha.wordpress.com/287/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=rguha.wordpress.com&amp;blog=4664940&amp;post=287&amp;subd=rguha&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://rguha.wordpress.com/2009/01/31/getting-the-go-into-a-graph-data-structure/feed/</wfw:commentRss>
		<slash:comments>3</slash:comments>
	
		<media:content url="http://0.gravatar.com/avatar/867224a54ab2831c16fe2e97186e6a1a?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">rguha</media:title>
		</media:content>
	</item>
		<item>
		<title>Annotating Bioassays</title>
		<link>http://rguha.wordpress.com/2009/01/25/annotating-bioassays/</link>
		<comments>http://rguha.wordpress.com/2009/01/25/annotating-bioassays/#comments</comments>
		<pubDate>Sun, 25 Jan 2009 17:03:06 +0000</pubDate>
		<dc:creator>Rajarshi Guha</dc:creator>
				<category><![CDATA[cheminformatics]]></category>
		<category><![CDATA[text mining]]></category>
		<category><![CDATA[visualization]]></category>
		<category><![CDATA[annotation]]></category>
		<category><![CDATA[database]]></category>
		<category><![CDATA[go]]></category>
		<category><![CDATA[network]]></category>
		<category><![CDATA[pubchem]]></category>

		<guid isPermaLink="false">http://rguha.wordpress.com/?p=274</guid>
		<description><![CDATA[I&#8217;ve been working for some time with the PubChem Bioassay collection &#8211; a set of 1293 assays that cover a range of techniques (enzymatic, phenotypic etc.), targets and sizes (from 20 molecules to 200,000 molecules). In addition, some assays are primary, high-throughput assays whereas a number of them are smaller, confirmatory assays. While an extremely [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=rguha.wordpress.com&amp;blog=4664940&amp;post=274&amp;subd=rguha&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p style="text-align:justify;">I&#8217;ve been working for <a href="http://www.springerlink.com/content/bu042m48032l4171/">some</a> <a href="http://cran.r-project.org/web/packages/rpubchem/index.html">time</a> with the PubChem <a href="http://www.ncbi.nlm.nih.gov/sites/entrez?db=pcassay">Bioassay</a> collection &#8211; a set of 1293 assays that cover a range of techniques (enzymatic, phenotypic etc.), targets and sizes (from 20 molecules to 200,000 molecules). In addition, some assays are primary, high-throughput assays whereas a number of them are smaller, confirmatory assays. While an extremely valuable collection, one of the drawbacks is the lack of curation. This has led to some people saying that the data is too noisy to be useful. Yes, the noise is a problem, but I think there&#8217;s still useful data to extract and model.</p>
<p style="text-align:justify;">One of the problems that I have faced is that while one can perform a full text search for assays on PubChem, there is no form of annotations on the assays themselves. One effect of this is that it is difficult to link an assay to other biological resources (though for enzymatic assays, one can determine a Pubmed protein identifier). While working on my <a href="http://www.slideshare.net/rguha/pubchem-bioassays-as-a-source-of-polypharmacology-presentation">bioassay network</a> project, I needed annotations and I didn&#8217;t want to do it manually.</p>
<p style="text-align:justify;"><span id="more-274"></span></p>
<h3 style="text-align:justify;">Automated GO Annotations</h3>
<p style="text-align:justify;">While manual annotations would be the most rigorous, I wanted to see to what extent automated methods could provide useful information. A bit of searching led me to <a href="http://eagl.unige.ch/GOCat/">GOCat</a> run by <a href="http://www.natlang.hcuge.ch/People/ruch/">Patrick Ruch</a>, which analyzes text and predicts a set of <a href="http://www.geneontology.org/">GO</a> terms that could be associated with that text. Working with Patrick and his student Julien Gobeill, we were able to process the description field of each assay to generate a set of &#8220;predicted GO terms&#8221;. The tool identifies each term as being related to function, component or pathway, and also provides a score for each predicted term, allowing us to rank the terms associated with an assay.</p>
<h3 style="text-align:justify;">Verification</h3>
<p style="text-align:justify;">The first thing to consider was correctness &#8211; are the assigned GO terms sensible? I manually verified about 30 assays and for many of them the top five terms that GOCat assigned were quite relevant (i.e.,  specific terms, that would be close to or actual leaf nodes in the GO hierarchy). In general, if one considered the top 15 or 20 terms, the relevancy was quite high. However, there were a number of assays for which the assigned terms were very general and thus, not very informative. It turns out that most of these arose from assays in which the description field was very <a href="http://pubchem.ncbi.nlm.nih.gov/assay/assay.cgi?aid=200">general</a> or <a href="http://pubchem.ncbi.nlm.nih.gov/assay/assay.cgi?aid=987">very</a> <a href="http://pubchem.ncbi.nlm.nih.gov/assay/assay.cgi?aid=773">short</a>, though there were some <a href="http://pubchem.ncbi.nlm.nih.gov/assay/assay.cgi?aid=635">examples</a> where there was a long description but the resultant predicted terms were still quite general and non-specific. Of course, this process of verification is subjective, and one can be more or less strict as to whether predicted terms make sense for a given assay.</p>
<p style="text-align:justify;">Overall, the results aren&#8217;t too bad, but this was a quick check that was included in an abstract submission.  More extensive verification (rigorous quality score, multiple people) is in the pipeline as we write up the paper.</p>
<h3 style="text-align:justify;">Browsing</h3>
<p style="text-align:justify;">But as we proceed with verification, we are now able to link the bioassays with other biological resources via GO terms. After Patrick and Julien generated the predicted terms I put them into a Postgres database and whipped up a simple <a href="http://rguha.ath.cx/~rguha/cicc/goaid/query.html">interface</a>. It&#8217;s not complete but does let you view the terms associated with an assay to varying levels of detail (such as all terms, component-related terms etc).</p>
<p style="text-align:justify;"><a href="http://rguha.files.wordpress.com/2009/01/goaidss.png"><img class="aligncenter size-medium wp-image-276" title="goaidss" src="http://rguha.files.wordpress.com/2009/01/goaidss.png?w=300&#038;h=209" alt="goaidss" width="300" height="209" /></a></p>
<h3 style="text-align:justify;">Networks &amp; Visualization</h3>
<p style="text-align:justify;">But the whole point of this exercise was to help me in my bioassay network project. More specifically I wanted to create a semantic similarity network of assays. That is, a network where the nodes are assays and two nodes are connected if the &#8220;semantic similarity&#8221; between two assays is greater than some cutoff. With the assigned GO terms, we can define a simplistic semantic similarity score by counting the number of terms in  common between the top 10 terms for two assays. With normalization this gives us a value between 0 and 1. Now, such a similarity score is quite arbitrary but provides some nice advantages as I noted in my <a href="http://www.slideshare.net/rguha/pubchem-bioassays-as-a-source-of-polypharmacology-presentation">presentation</a>. For now it leads to some nice pictures, that I think also provide some  (high level) insight into the quality of the assigned terms. For example, using a cutoff of 0.9, we get a network that looks like Figure 1, where there are many disconnected components. Essentially, this is putting closely related assays together. Thus the large number of 2-node components appears to correspond to primary and secondary assay pairs. Whereas the larger clusters near the top tend to focus on groups of assays that have come from a specific center (such as the NCI DTP assays) or for a specific task (such as cytotoxicity assays). On going to lower cutoffs (Figures 2 and 3) we start seeing some interesting structures. As I noted, these are still just pretty pictures and I haven&#8217;t analysed them yet for meaning. But they do suggest these visualizations as a useful approach to exploring the assay collection compared with traditional full text search.</p>
<p style="text-align:justify;">Great collaborators, fresh data, pretty pictures, intriguing analyses &#8211; aah life is good!</p>
<p style="text-align:justify;">
<div id="attachment_275" class="wp-caption aligncenter" style="width: 310px"><a href="http://rguha.files.wordpress.com/2009/01/gosim9.png"><img class="size-medium wp-image-275" title="gosim9" src="http://rguha.files.wordpress.com/2009/01/gosim9.png?w=300&#038;h=199" alt="Semantic Similarity Network (Cutoff 0.9)" width="300" height="199" /></a><p class="wp-caption-text">Figure 1. Semantic Similarity Network (Cutoff 0.9)</p></div>
<div id="attachment_278" class="wp-caption aligncenter" style="width: 310px"><a href="http://rguha.files.wordpress.com/2009/01/gosim7.png"><img class="size-medium wp-image-278" title="gosim7" src="http://rguha.files.wordpress.com/2009/01/gosim7.png?w=300&#038;h=204" alt="Figure 2. Semantic Similarity Network (Cutoff = 0.7)" width="300" height="204" /></a><p class="wp-caption-text">Figure 2. Semantic Similarity Network (Cutoff = 0.7)</p></div>
<div id="attachment_279" class="wp-caption aligncenter" style="width: 310px"><a href="http://rguha.files.wordpress.com/2009/01/gosim6.png"><img class="size-medium wp-image-279" title="gosim6" src="http://rguha.files.wordpress.com/2009/01/gosim6.png?w=300&#038;h=204" alt="Figure 3. Semantic Similarity Network (Cutoff = 0.6)" width="300" height="204" /></a><p class="wp-caption-text">Figure 3. Semantic Similarity Network (Cutoff = 0.6)</p></div>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/rguha.wordpress.com/274/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/rguha.wordpress.com/274/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/rguha.wordpress.com/274/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/rguha.wordpress.com/274/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/rguha.wordpress.com/274/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/rguha.wordpress.com/274/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/rguha.wordpress.com/274/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/rguha.wordpress.com/274/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/rguha.wordpress.com/274/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/rguha.wordpress.com/274/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/rguha.wordpress.com/274/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/rguha.wordpress.com/274/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/rguha.wordpress.com/274/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/rguha.wordpress.com/274/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=rguha.wordpress.com&amp;blog=4664940&amp;post=274&amp;subd=rguha&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://rguha.wordpress.com/2009/01/25/annotating-bioassays/feed/</wfw:commentRss>
		<slash:comments>1</slash:comments>
	
		<media:content url="http://0.gravatar.com/avatar/867224a54ab2831c16fe2e97186e6a1a?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">rguha</media:title>
		</media:content>

		<media:content url="http://rguha.files.wordpress.com/2009/01/goaidss.png?w=300" medium="image">
			<media:title type="html">goaidss</media:title>
		</media:content>

		<media:content url="http://rguha.files.wordpress.com/2009/01/gosim9.png?w=300" medium="image">
			<media:title type="html">gosim9</media:title>
		</media:content>

		<media:content url="http://rguha.files.wordpress.com/2009/01/gosim7.png?w=300" medium="image">
			<media:title type="html">gosim7</media:title>
		</media:content>

		<media:content url="http://rguha.files.wordpress.com/2009/01/gosim6.png?w=300" medium="image">
			<media:title type="html">gosim6</media:title>
		</media:content>
	</item>
		<item>
		<title>SALI Viewer Now on GitHub</title>
		<link>http://rguha.wordpress.com/2009/01/16/sali-viewer-now-on-github/</link>
		<comments>http://rguha.wordpress.com/2009/01/16/sali-viewer-now-on-github/#comments</comments>
		<pubDate>Fri, 16 Jan 2009 19:17:01 +0000</pubDate>
		<dc:creator>Rajarshi Guha</dc:creator>
				<category><![CDATA[cheminformatics]]></category>
		<category><![CDATA[software]]></category>
		<category><![CDATA[graphviz]]></category>
		<category><![CDATA[network]]></category>
		<category><![CDATA[sali]]></category>

		<guid isPermaLink="false">http://rguha.wordpress.com/?p=266</guid>
		<description><![CDATA[Last year, John Van Drie and I published two papers (here and here) on the Structure Activity Landscape Index (SALI), which is a way to view SAR data as a network of compounds. Along with the paper, I put up a simple Java application (licensed under the LGPL) to generate and explore these networks &#8211; [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=rguha.wordpress.com&amp;blog=4664940&amp;post=266&amp;subd=rguha&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p style="text-align:justify;">Last year, <a href="http://www.vandrieresearch.com/">John Van Drie</a> and I published two papers (<a href="http://dx.doi.org/10.1021/ci7004093">here</a> and <a href="http://dx.doi.org/10.1021/ci8001414">here</a>) on the Structure Activity Landscape Index (SALI), which is a way to view SAR data as a network of compounds. Along with the paper, I put up a simple Java application (licensed under the LGPL) to generate and explore these networks &#8211; you only need to provide a file containing SMILES and activities. It&#8217;s based on <a href="http://zvtm.sourceforge.net/zgrviewer.html">ZGRViewer</a> &#8211; a very slick GUI for <a href="http://www.graphviz.org/">Graphviz</a> generated networks. I finally got around to reorganizing the code and putting it up on a GitHub <a href="http://github.com/rajarshi/saliviewer/tree/master">repository.</a> You can get more details of the application and the last stable version <a href="http://cheminfo.informatics.indiana.edu/~rguha/code/java/salivis/">here</a>.</p>
<p style="text-align:justify;"><a href="http://rguha.files.wordpress.com/2009/01/sali-ss.png"><img class="aligncenter size-medium wp-image-271" title="sali-ss" src="http://rguha.files.wordpress.com/2009/01/sali-ss.png?w=300&#038;h=198" alt="sali-ss" width="300" height="198" /></a></p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/rguha.wordpress.com/266/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/rguha.wordpress.com/266/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/rguha.wordpress.com/266/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/rguha.wordpress.com/266/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/rguha.wordpress.com/266/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/rguha.wordpress.com/266/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/rguha.wordpress.com/266/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/rguha.wordpress.com/266/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/rguha.wordpress.com/266/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/rguha.wordpress.com/266/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/rguha.wordpress.com/266/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/rguha.wordpress.com/266/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/rguha.wordpress.com/266/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/rguha.wordpress.com/266/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=rguha.wordpress.com&amp;blog=4664940&amp;post=266&amp;subd=rguha&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://rguha.wordpress.com/2009/01/16/sali-viewer-now-on-github/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://0.gravatar.com/avatar/867224a54ab2831c16fe2e97186e6a1a?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">rguha</media:title>
		</media:content>

		<media:content url="http://rguha.files.wordpress.com/2009/01/sali-ss.png?w=300" medium="image">
			<media:title type="html">sali-ss</media:title>
		</media:content>
	</item>
		<item>
		<title>ONS Solubility Predictions</title>
		<link>http://rguha.wordpress.com/2009/01/14/ons-solubility-predictions/</link>
		<comments>http://rguha.wordpress.com/2009/01/14/ons-solubility-predictions/#comments</comments>
		<pubDate>Wed, 14 Jan 2009 21:31:05 +0000</pubDate>
		<dc:creator>Rajarshi Guha</dc:creator>
				<category><![CDATA[software]]></category>
		<category><![CDATA[ons]]></category>
		<category><![CDATA[prediction]]></category>
		<category><![CDATA[qsar]]></category>
		<category><![CDATA[REST]]></category>
		<category><![CDATA[solubility]]></category>

		<guid isPermaLink="false">http://rguha.wordpress.com/?p=262</guid>
		<description><![CDATA[Using the model deployment and prediction service, I put up the two linear regression models I had built so far (described in more detail here). While REST is nice, a simple web page that allows you to paste a set of SMILES and get back predictions is handy. So I whipped together a simple interface [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=rguha.wordpress.com&amp;blog=4664940&amp;post=262&amp;subd=rguha&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p style="text-align:justify;">Using the model <a href="http://rguha.wordpress.com/2009/01/14/deploying-predictive-models/">deployment and prediction service</a>, I put up the two linear regression models I had built so far (described in more detail <a href="http://onschallenge.wikispaces.com/Predictive+Solubility">here</a>). While REST is nice, a simple web page that allows you to paste a set of SMILES and get back predictions is handy. So I whipped together a simple interface to the prediction service, allowing one to select a model, view the author-generated description and get a nice (sortable!) table of predicted values. View it <a href="http://rguha.ath.cx/~rguha/cicc/jcsol/pred.html">here</a>. As noted in my previous <a href="http://rguha.wordpress.com/2009/01/14/deploying-predictive-models/">post</a> it&#8217;s not going to be very fast, but hopefully that will change in the future.</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/rguha.wordpress.com/262/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/rguha.wordpress.com/262/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/rguha.wordpress.com/262/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/rguha.wordpress.com/262/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/rguha.wordpress.com/262/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/rguha.wordpress.com/262/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/rguha.wordpress.com/262/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/rguha.wordpress.com/262/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/rguha.wordpress.com/262/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/rguha.wordpress.com/262/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/rguha.wordpress.com/262/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/rguha.wordpress.com/262/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/rguha.wordpress.com/262/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/rguha.wordpress.com/262/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=rguha.wordpress.com&amp;blog=4664940&amp;post=262&amp;subd=rguha&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://rguha.wordpress.com/2009/01/14/ons-solubility-predictions/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://0.gravatar.com/avatar/867224a54ab2831c16fe2e97186e6a1a?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">rguha</media:title>
		</media:content>
	</item>
		<item>
		<title>Deploying Predictive Models</title>
		<link>http://rguha.wordpress.com/2009/01/14/deploying-predictive-models/</link>
		<comments>http://rguha.wordpress.com/2009/01/14/deploying-predictive-models/#comments</comments>
		<pubDate>Wed, 14 Jan 2009 21:23:50 +0000</pubDate>
		<dc:creator>Rajarshi Guha</dc:creator>
				<category><![CDATA[cheminformatics]]></category>
		<category><![CDATA[cdk]]></category>
		<category><![CDATA[python]]></category>
		<category><![CDATA[qsar]]></category>
		<category><![CDATA[R]]></category>
		<category><![CDATA[REST]]></category>
		<category><![CDATA[rpy2]]></category>
		<category><![CDATA[solubility]]></category>
		<category><![CDATA[web service]]></category>

		<guid isPermaLink="false">http://rguha.wordpress.com/?p=260</guid>
		<description><![CDATA[Over the past few days I&#8217;ve been developing some predictive models in R, for the solubility data being generated as part of the ONS Solubility Challenge. As I develop the models I put up a brief summary of the results on the wiki. In the end however, we&#8217;d like to use these models to predict [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=rguha.wordpress.com&amp;blog=4664940&amp;post=260&amp;subd=rguha&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p style="text-align:justify;">Over the past few days I&#8217;ve been developing some predictive models in R, for the <a href="http://spreadsheets.google.com/ccc?key=plwwufp30hfq0udnEmRD1aQ&amp;hl=en">solubility data</a> being generated as part of the <a href="http://onschallenge.wikispaces.com/">ONS Solubility Challenge</a>. As I develop the models I put up a brief summary of the results on the <a href="http://onschallenge.wikispaces.com/Predictive+Solubility">wiki</a>. In the end however, we&#8217;d like to use these models to predict the solubility of untested compounds. While anybody can send me a SMILES string and get back a prediction, it&#8217;s more useful (and less work for me!) if a user can do it themselves. This requires that the models be deployed and made available as a web page or a service. Last year I developed a series of statistical web services based on R. The services were written in Java and are described in this <a href="http://dx.doi.org/10.1021/ci700188u">paper</a>. Since I&#8217;m working more with REST services these days, I wanted to see how easy it&#8217;d be to develop a model deployment system using Python, thus avoiding a multi-tiered system. With the help of <a href="http://rpy.sourceforge.net/rpy2.html">rpy2</a>, it turns out that this wasn&#8217;t very difficult.</p>
<h3 style="text-align:justify;"><span id="more-260"></span>Infrastructure</h3>
<p style="text-align:justify;">The setup is a <a href="http://www.modpython.org/">mod_python</a> based REST service. Before describing the service, a little bit about the models themselves. The setup requires that you develop a model in R and then save it as a binary R file (via <a href="http://stat.ethz.ch/R-manual/R-patched/library/base/html/save.html">save</a>). Right now you have to save the model in a variable called &#8220;model&#8221; &#8211; a bit restrictive but it might change in the future. You can build any type of model that has overloaded the <a href="http://stat.ethz.ch/R-manual/R-patched/library/stats/html/predict.html">predict</a> method. Once you have that you need to edit a model manifest file that contains information on the author, description of the model and so on. More importantly, you have to specify the descriptors used in the model. This leads to a limitation &#8211; the descriptor calculation step of the service uses the CDK <a href="http://rguha.wordpress.com/2009/01/11/update-to-the-rest-descriptor-services/">descriptor service</a> and so the models must employ the CDK descriptors. While restrictive it&#8217;s not too bad, since the CDK has a wide variety of <a href="http://cheminfo.informatics.indiana.edu/~rguha/code/java/nightly/dnames.html#molecule">molecular descriptors</a>. You can get more details about how models are deployed and the format of the manifest from the GitHub <a href="http://github.com/rajarshi/rest-ws/tree">repository</a>.</p>
<h3 style="text-align:justify;">Usage</h3>
<p style="text-align:justify;">With the model file and the manifest details it&#8217;s pretty easy to set up a simple Python service that uses rpy2 to load the model, calculate descriptors for an input SMILES (Base64 encoded), get a prediction and return it. Thus, to get a list of available models, visit</p>
<pre style="text-align:justify;padding-left:30px;"><a href="http://rguha.ath.cx/~rguha/cicc/rest/predict/">http://rguha.ath.cx/~rguha/cicc/rest/predict/</a></pre>
<p style="text-align:justify;">This gives a plain text page with a list of model identifiers. You can then use a model identifier to get the details of the model (as provided by the author of the model) by appending the identifier. An example would be</p>
<pre style="text-align:justify;padding-left:30px;"><a href="http://rguha.ath.cx/~rguha/cicc/rest/predict/Solubility2">http://rguha.ath.cx/~rguha/cicc/rest/predict/Solubility2</a></pre>
<p style="text-align:justify;">Finally, to get a prediction from the above model, simply append a Base64 encoded SMILES string</p>
<pre style="text-align:justify;padding-left:30px;"><a href="http://rguha.ath.cx/~rguha/cicc/rest/predict/Solubility2/YzFjY2NjYzFDQ09DQw==">http://rguha.ath.cx/~rguha/cicc/rest/predict/Solubility2/YzFjY2NjYzFDQ09DQw==</a></pre>
<p style="text-align:justify;">and you end up with a plain text representation of the predicted value.</p>
<h3 style="text-align:justify;">Caveats</h3>
<p style="text-align:justify;">Admittedly the current version of this service is a quick hack and has a number of restrictions. While any type of model can be deployed, something like a random forest model will require you to list many descriptors in the manifest file manually. In the future, this should probably be automated via an R function. While the manifest for a given model can contain an arbitrarily long description, it&#8217;s up to the developer to decide what goes in. Ideally, we&#8217;d serialize the model to <a href="http://www.dmg.org/">PMML</a> so that we could easily include details such as coefficients, training and validation statistics and so on. The use of PMML would allow easy inclusion in the manifest. On the other hand it&#8217;s relatively easy to extract this information from the model file, so it might simply require the construction of a different URL.</p>
<p style="text-align:justify;">Another drawback is the fact that one gets a single return value. Now, it&#8217;s pretty easy to extract, say, confidence limits but this is dependent on the nature of the model. Providing more information in the return value would probably best be handled by generating PMML output.</p>
<p style="text-align:justify;">The current format of the manifest file is pretty crude &#8211; ideally I&#8217;d use <a href="http://dublincore.org/">Dublin Core</a> to represent provenance and support more details of the model (such as model type etc), thus avoiding the need to load the model file. Also, there is no schema for the format, which would be a useful addition. Some form of versioning information would also be useful.</p>
<p style="text-align:justify;">One of the biggest performance bottlenecks is that the service deals with one SMILES string at a time. If you&#8217;re getting predictions for many molecules, this can become slow (since each prediction loads the model file). Ideally, the service would recognize a POST request and pull one or more SMILES from the fields in the request. This would allow predictions in bulk and make it much faster. Another advantage of using POST would be the ability to provide SDF (or any other multi-line) input.</p>
<h3 style="text-align:justify;">Conclusions</h3>
<p style="text-align:justify;">Model deployment is now simple to achieve and Python is sweet!</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/rguha.wordpress.com/260/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/rguha.wordpress.com/260/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/rguha.wordpress.com/260/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/rguha.wordpress.com/260/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/rguha.wordpress.com/260/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/rguha.wordpress.com/260/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/rguha.wordpress.com/260/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/rguha.wordpress.com/260/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/rguha.wordpress.com/260/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/rguha.wordpress.com/260/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/rguha.wordpress.com/260/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/rguha.wordpress.com/260/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/rguha.wordpress.com/260/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/rguha.wordpress.com/260/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=rguha.wordpress.com&amp;blog=4664940&amp;post=260&amp;subd=rguha&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://rguha.wordpress.com/2009/01/14/deploying-predictive-models/feed/</wfw:commentRss>
		<slash:comments>1</slash:comments>
	
		<media:content url="http://0.gravatar.com/avatar/867224a54ab2831c16fe2e97186e6a1a?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">rguha</media:title>
		</media:content>
	</item>
		<item>
		<title>Improved CDK Depiction Service</title>
		<link>http://rguha.wordpress.com/2009/01/14/improved-cdk-depiction-service/</link>
		<comments>http://rguha.wordpress.com/2009/01/14/improved-cdk-depiction-service/#comments</comments>
		<pubDate>Wed, 14 Jan 2009 18:04:21 +0000</pubDate>
		<dc:creator>Rajarshi Guha</dc:creator>
				<category><![CDATA[software]]></category>
		<category><![CDATA[Uncategorized]]></category>
		<category><![CDATA[cdk]]></category>
		<category><![CDATA[depiction]]></category>

		<guid isPermaLink="false">http://rguha.wordpress.com/?p=255</guid>
		<description><![CDATA[The folks at the EBI have been doing some great work on the CDK. A major effort is underway to revamp JChemPaint and part of this involves improving the rendering of 2D depictions. While not complete I rebuilt a version of the CDK 1.2.x branch with the latest rendering code from the jchempaint-primary branch and [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=rguha.wordpress.com&amp;blog=4664940&amp;post=255&amp;subd=rguha&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p style="text-align:justify;">The folks at the <a href="http://www.ebi.ac.uk/steinbeck/">EBI</a> have been doing some great work on the CDK. A major effort is underway to revamp <a href="http://apps.sourceforge.net/mediawiki/cdk/index.php?title=JChemPaint">JChemPaint</a> and part of this involves improving the rendering of 2D depictions. While not complete I rebuilt a version of the CDK 1.2.x branch with the latest rendering code from the <a href="http://cdk.svn.sourceforge.net/viewvc/cdk/cdk/branches/jchempaint-primary/">jchempaint-primary</a> branch and updated the CDK web service. The results are much nicer, though there&#8217;s scope for improvements. See for example</p>
<pre style="text-align:justify;padding-left:30px;"><a href="http://rguha.ath.cx/~rguha/cicc/rest/depict/c1ccccc1">http://rguha.ath.cx/~rguha/cicc/rest/depict/c1ccccc1</a></pre>
<pre style="text-align:justify;padding-left:30px;"><a href="http://rguha.ath.cx/~rguha/cicc/rest/depict/C1CCCCC12CCCCC2">http://rguha.ath.cx/~rguha/cicc/rest/depict/C1CCCCC12CCCCC2</a></pre>
<pre style="padding-left:30px;"><a href="http://rguha.ath.cx/~rguha/cicc/rest/depict/CC(=O)OC1=CC=CC=C1C(=O)O">http://rguha.ath.cx/~rguha/cicc/rest/depict/CC(=O)OC1=CC=CC=C1C(=O)O</a></pre>
<pre style="text-align:justify;padding-left:30px;"><a href="http://rguha.ath.cx/~rguha/cicc/rest/depict/c1ccccc1CC=CC%23N">http://rguha.ath.cx/~rguha/cicc/rest/depict/c1ccccc1CC=CC%23N</a></pre>
<p style="text-align:justify;">Thanks to Gilleain and Egon for pointing me in the right direction. Anybody using this service should see the new depictions automatically.</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/rguha.wordpress.com/255/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/rguha.wordpress.com/255/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/rguha.wordpress.com/255/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/rguha.wordpress.com/255/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/rguha.wordpress.com/255/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/rguha.wordpress.com/255/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/rguha.wordpress.com/255/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/rguha.wordpress.com/255/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/rguha.wordpress.com/255/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/rguha.wordpress.com/255/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/rguha.wordpress.com/255/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/rguha.wordpress.com/255/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/rguha.wordpress.com/255/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/rguha.wordpress.com/255/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=rguha.wordpress.com&amp;blog=4664940&amp;post=255&amp;subd=rguha&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://rguha.wordpress.com/2009/01/14/improved-cdk-depiction-service/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://0.gravatar.com/avatar/867224a54ab2831c16fe2e97186e6a1a?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">rguha</media:title>
		</media:content>
	</item>
		<item>
		<title>Update to the REST Descriptor Services</title>
		<link>http://rguha.wordpress.com/2009/01/11/update-to-the-rest-descriptor-services/</link>
		<comments>http://rguha.wordpress.com/2009/01/11/update-to-the-rest-descriptor-services/#comments</comments>
		<pubDate>Sun, 11 Jan 2009 17:52:20 +0000</pubDate>
		<dc:creator>Rajarshi Guha</dc:creator>
				<category><![CDATA[cheminformatics]]></category>
		<category><![CDATA[software]]></category>
		<category><![CDATA[base64]]></category>
		<category><![CDATA[descriptors]]></category>
		<category><![CDATA[python]]></category>
		<category><![CDATA[REST]]></category>
		<category><![CDATA[web service]]></category>

		<guid isPermaLink="false">http://rguha.wordpress.com/?p=250</guid>
		<description><![CDATA[The current version of the REST interface to the CDK descriptors allowed one to access descriptor values for a SMILES string by simply appending it to an URL, resulting in something like http://rguha.ath.cx/~rguha/cicc/rest/desc/descriptors/ org.openscience.cdk.qsar.descriptors.molecular.ALOGPDescriptor/c1ccccc1COCC This type of URL is pretty handy to construct by hand. However, as Pat Walters pointed out in the comments to [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=rguha.wordpress.com&amp;blog=4664940&amp;post=250&amp;subd=rguha&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p style="text-align:justify;">The current version of the <a href="http://rguha.wordpress.com/2009/01/07/playing-with-rest-descriptor-services/">REST interface</a> to the CDK descriptors allowed one to access descriptor values for a SMILES string by simply appending it to a URL, resulting in something like</p>
<pre style="padding-left:30px;"><a href="http://rguha.ath.cx/~rguha/cicc/rest/desc/descriptors/org.openscience.cdk.qsar.descriptors.molecular.ALOGPDescriptor/c1ccccc1COCC">http://rguha.ath.cx/~rguha/cicc/rest/desc/descriptors/</a>
<a href="http://rguha.ath.cx/~rguha/cicc/rest/desc/descriptors/org.openscience.cdk.qsar.descriptors.molecular.ALOGPDescriptor/c1ccccc1COCC">org.openscience.cdk.qsar.descriptors.molecular.ALOGPDescriptor/c1ccccc1COCC</a></pre>
<p style="text-align:justify;">This type of URL is pretty handy to construct by hand. However, as Pat Walters pointed out in the comments to that post, SMILES containing &#8216;#&#8217; will cause problems since that character is a URL fragment identifier. Furthermore, the presence of a &#8216;/&#8217; in a SMILES string necessitates some processing in the service to recognize it as part of the SMILES, rather than a URL path separator. While the service could handle these (at the expense of messy code) it turned out that there were subtle bugs.</p>
<p style="text-align:justify;">Based on Pat&#8217;s suggestion I converted the service to use <a href="http://en.wikipedia.org/wiki/Base64">base64</a> encoded SMILES, which let me simplify the code and remove the bugs. As a result, one cannot append the SMILES directly to the URL&#8217;s. Instead the above URL would be rewritten in the form</p>
<pre style="padding-left:30px;"><a href="http://rguha.ath.cx/~rguha/cicc/rest/desc/descriptors/org.openscience.cdk.qsar.descriptors.molecular.ALOGPDescriptor/YzFjY2NjYzFDT0ND">http://rguha.ath.cx/~rguha/cicc/rest/desc/descriptors/</a>
<a href="http://rguha.ath.cx/~rguha/cicc/rest/desc/descriptors/org.openscience.cdk.qsar.descriptors.molecular.ALOGPDescriptor/YzFjY2NjYzFDT0ND">org.openscience.cdk.qsar.descriptors.molecular.ALOGPDescriptor/YzFjY2NjYzFDT0ND</a></pre>
<p style="text-align:justify;">All the example URL&#8217;s described in my previous <a href="http://rguha.wordpress.com/2009/01/07/playing-with-rest-descriptor-services/#comments">post</a> that involve SMILES strings, should be rewritten using base64 encoded SMILES. So to get a document listing all descriptors for &#8220;c1ccccc1COCC&#8221; one would write</p>
<pre style="text-align:justify;padding-left:30px;"><a href="http://rguha.ath.cx/~rguha/cicc/rest/desc/descriptors/YzFjY2NjYzFDT0ND">http://rguha.ath.cx/~rguha/cicc/rest/desc/descriptors/YzFjY2NjYzFDT0ND</a></pre>
<p style="text-align:justify;">and then follow the links therein.</p>
<p style="text-align:justify;">While this makes it a little harder to directly write out these URL&#8217;s by hand, I expect that most uses of this service would be programmatic &#8211; in which case getting base64 encoded SMILES is trivial.</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/rguha.wordpress.com/250/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/rguha.wordpress.com/250/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/rguha.wordpress.com/250/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/rguha.wordpress.com/250/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/rguha.wordpress.com/250/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/rguha.wordpress.com/250/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/rguha.wordpress.com/250/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/rguha.wordpress.com/250/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/rguha.wordpress.com/250/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/rguha.wordpress.com/250/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/rguha.wordpress.com/250/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/rguha.wordpress.com/250/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/rguha.wordpress.com/250/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/rguha.wordpress.com/250/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=rguha.wordpress.com&amp;blog=4664940&amp;post=250&amp;subd=rguha&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://rguha.wordpress.com/2009/01/11/update-to-the-rest-descriptor-services/feed/</wfw:commentRss>
		<slash:comments>3</slash:comments>
	
		<media:content url="http://0.gravatar.com/avatar/867224a54ab2831c16fe2e97186e6a1a?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">rguha</media:title>
		</media:content>
	</item>
		<item>
		<title>Playing with REST Descriptor Services</title>
		<link>http://rguha.wordpress.com/2009/01/07/playing-with-rest-descriptor-services/</link>
		<comments>http://rguha.wordpress.com/2009/01/07/playing-with-rest-descriptor-services/#comments</comments>
		<pubDate>Wed, 07 Jan 2009 07:06:23 +0000</pubDate>
		<dc:creator>Rajarshi Guha</dc:creator>
				<category><![CDATA[cheminformatics]]></category>
		<category><![CDATA[software]]></category>
		<category><![CDATA[cdk]]></category>
		<category><![CDATA[descriptor]]></category>
		<category><![CDATA[google]]></category>
		<category><![CDATA[javascript]]></category>
		<category><![CDATA[REST]]></category>
		<category><![CDATA[web service]]></category>

		<guid isPermaLink="false">http://rguha.wordpress.com/?p=245</guid>
		<description><![CDATA[As part of my work at IU I have been implementing a number of cheminformatics web services. Initially these were SOAP, but I realized that REST interfaces make life much easier. (also see here) As a result, a number of these services have simple REST interfaces. One such service provides molecular descriptor calculations, using the [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=rguha.wordpress.com&amp;blog=4664940&amp;post=245&amp;subd=rguha&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p style="text-align:justify;">As part of my work at IU I have been implementing a number of cheminformatics web services. Initially these were <a href="http://en.wikipedia.org/wiki/SOAP_(protocol)">SOAP</a>, but I realized that <a href="http://en.wikipedia.org/wiki/Representational_State_Transfer">REST</a> interfaces make life much easier. (also see <a href="http://www.petefreitag.com/item/431.cfm">here</a>) As a result, a <a href="http://www.chembiogrid.org/projects/proj_rest.html">number of these services</a> have simple REST interfaces. One such service provides molecular descriptor calculations, using the <a href="http://apps.sourceforge.net/mediawiki/cdk/index.php?title=Main_Page">CDK</a> as the backend. Thus by visiting  (i.e., making a HTTP GET request) a URL of the form</p>
<pre style="text-align:justify;padding-left:30px;"><a href="http://rguha.ath.cx/~rguha/cicc/rest/desc/descriptors">http://rguha.ath.cx/~rguha/cicc/rest/desc/descriptors/CC(=O)</a></pre>
<p style="text-align:justify;">you get a simple XML document containing a list of URL&#8217;s. Each URL represents a specific &#8220;resource&#8221;. In this context, the resource is the descriptor values for the given molecule. Thus by visiting</p>
<pre style="text-align:justify;padding-left:30px;"><a href="http://rguha.ath.cx/~rguha/cicc/rest/desc/descriptors/org.openscience.cdk.qsar.descriptors.molecular.ALOGPDescriptor/CC(=O)C">http://rguha.ath.cx/~rguha/cicc/rest/desc/descriptors/</a>
<a href="http://rguha.ath.cx/~rguha/cicc/rest/desc/descriptors/org.openscience.cdk.qsar.descriptors.molecular.ALOGPDescriptor/CC(=O)C">org.openscience.cdk.qsar.descriptors.</a><a href="http://rguha.ath.cx/~rguha/cicc/rest/desc/descriptors/org.openscience.cdk.qsar.descriptors.molecular.ALOGPDescriptor/CC(=O)C">molecular.ALOGPDescriptor/CC(=O)C</a></pre>
<p style="text-align:justify;">one gets another simple XML document that lists the names and values of the AlogP descriptor. In this case, the CDK implementation evaluates AlogP, AlogP2 and molar refractivity &#8211; so there are actually three descriptor values. On the other hand something like the  <a href="http://rguha.ath.cx/~rguha/cicc/rest/desc/descriptors/org.openscience.cdk.qsar.descriptors.molecular.WeightDescriptor/CC(=O)C">molecular weight</a> descriptor gives a single value. To just see the list of available descriptors visit</p>
<pre style="text-align:justify;padding-left:30px;"><a href="http://www.chembiogrid.org/cheminfo/rest/desc/descriptors">http://www.chembiogrid.org/cheminfo/rest/desc/descriptors</a></pre>
<p style="text-align:justify;">which gives an XML document containing a series of links. Visiting one of these links gives the &#8220;descriptor specification&#8221; &#8211; information on the vendor, version, reference to a descriptor ontology and so on.</p>
<p style="text-align:justify;">(I should point out that the descriptors available in this service are from a pretty old version of the CDK. I really should update the descriptors to the 1.2.x versions)</p>
<h3 style="text-align:justify;">Applications</h3>
<p style="text-align:justify;">This type of interface makes it easy to whip up various applications. One example is the <a href="http://rguha.wordpress.com/2008/12/30/the-ons-challenge-visualizing-chemical-space/">PCA analysis</a> of compound collections. Another one I put together today based on a conversation with <a href="http://www.chemistry.drexel.edu/people/bradley/bradley.asp">Jean-Claude</a> was a simple <a href="http://rguha.ath.cx/~rguha/cicc/jcsol/dplot.html">application</a> to plot pairs of descriptor values for a collection of SMILES.</p>
<p style="text-align:justify;"><a href="http://rguha.files.wordpress.com/2009/01/dppss1.png"><img class="aligncenter size-medium wp-image-247" title="dppss1" src="http://rguha.files.wordpress.com/2009/01/dppss1.png?w=300&#038;h=197" alt="dppss1" width="300" height="197" /></a></p>
<p style="text-align:justify;">The app is pretty simple (and quite slow, since it uses synchronous GET&#8217;s to the descriptor service for each SMILES and has to make two calls for each SMILES &#8211; hey, it was a quick hack!). Currently, it&#8217;s a bit restrictive &#8211; if a descriptor calculates multiple values, it will only use the first value. To see how many values a molecular descriptor calculates, see the list <a href="http://cheminfo.informatics.indiana.edu/~rguha/code/java/nightly/dnames.html#molecule">here</a>.</p>
<p style="text-align:justify;">With a little more effort one could easily have a pretty nice online descriptor calculation application rivaling a standalone application such as the CDK descriptor <a href="http://cheminfo.informatics.indiana.edu/~rguha/code/java/cdkdesc.html">GUI</a>.</p>
<p style="text-align:justify;">Also, if you struggle with nice CSS layouts, the <a href="http://layouts.ironmyers.com/">CSS Layout Collection</a> is a fantastic resource. And <a href="http://jquery.com/">jQuery</a> rocks.</p>
<p style="text-align:justify;">
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/rguha.wordpress.com/245/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/rguha.wordpress.com/245/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/rguha.wordpress.com/245/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/rguha.wordpress.com/245/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/rguha.wordpress.com/245/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/rguha.wordpress.com/245/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/rguha.wordpress.com/245/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/rguha.wordpress.com/245/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/rguha.wordpress.com/245/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/rguha.wordpress.com/245/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/rguha.wordpress.com/245/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/rguha.wordpress.com/245/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/rguha.wordpress.com/245/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/rguha.wordpress.com/245/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=rguha.wordpress.com&amp;blog=4664940&amp;post=245&amp;subd=rguha&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://rguha.wordpress.com/2009/01/07/playing-with-rest-descriptor-services/feed/</wfw:commentRss>
		<slash:comments>4</slash:comments>
	
		<media:content url="http://0.gravatar.com/avatar/867224a54ab2831c16fe2e97186e6a1a?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">rguha</media:title>
		</media:content>

		<media:content url="http://rguha.files.wordpress.com/2009/01/dppss1.png?w=300" medium="image">
			<media:title type="html">dppss1</media:title>
		</media:content>
	</item>
	</channel>
</rss>
