View Javadoc
1   package ubic.basecode.ontology.ncbo;
2   
3   import org.apache.commons.lang3.StringUtils;
4   import org.apache.commons.lang3.exception.ExceptionUtils;
5   import org.slf4j.Logger;
6   import org.slf4j.LoggerFactory;
7   import org.w3c.dom.Document;
8   import org.w3c.dom.Element;
9   import org.w3c.dom.Node;
10  import org.w3c.dom.NodeList;
11  import org.xml.sax.SAXException;
12  import ubic.basecode.util.Configuration;
13  
14  import javax.xml.parsers.DocumentBuilder;
15  import javax.xml.parsers.DocumentBuilderFactory;
16  import javax.xml.parsers.ParserConfigurationException;
17  import java.io.IOException;
18  import java.io.InputStream;
19  import java.net.ConnectException;
20  import java.net.URL;
21  import java.util.Collection;
22  import java.util.TreeSet;
23  import java.util.regex.Matcher;
24  import java.util.regex.Pattern;
25  
26  /**
27   * Use the NCBO annotator to find ontology terms matching strings.
28   *
29   */
30  public class AnnotatorClient {
31  
32      // ONTOLOGY ACCRONYM
33      public static final String HP_ONTOLOGY = "HP";
34      public static final String DOID_ONTOLOGY = "DOID";
35  
36      private final static int MAX_TRIES = 3;
37  
38      private static Logger log = LoggerFactory.getLogger( AnnotatorClient.class );
39  
40      // this API_KEY needs to be added to properties
41      private static String API_KEY = Configuration.getString( "ncbo.api.key" );
42  
43      private static String ANNOTATOR_URL = "http://data.bioontology.org/annotator?";
44  
45      // set this to search other things than only DOID and HP
46      private static String ontologies = HP_ONTOLOGY + "," + DOID_ONTOLOGY;
47  
48      /**
49       *
50       * @param  term
51       * @return
52       * @throws ParserConfigurationException
53       * @throws SAXException
54       * @throws IllegalStateException
55       * @throws Exception
56       */
57      public static Collection<AnnotatorResponse> findTerm( String term )
58          throws IOException, ParserConfigurationException, IllegalStateException, SAXException {
59          if ( StringUtils.isBlank( API_KEY ) ) {
60              throw new IllegalStateException( "NCBO ncbo.api.key needs to be configured" );
61          }
62  
63          Collection<AnnotatorResponse> responsesFound = new TreeSet<>();
64  
65          String termClean = removeSpecialCharacters( term );
66  
67          if ( StringUtils.isBlank( termClean ) ) return responsesFound;
68  
69          String url = ANNOTATOR_URL + "apikey=" + API_KEY + "&max_level=0&ontologies=" + ontologies
70              + "&format=xml&text=" + termClean;
71  
72          if ( log.isDebugEnabled() ) log.debug( "request url: " + url );
73  
74          int tries = 0;
75  
76          InputStream response = null;
77          while ( response == null && tries < MAX_TRIES ) {
78              try {
79                  response = new URL( url ).openStream();
80              } catch ( IOException e ) {
81                  try {
82                      Thread.sleep( 10000 ); // long wait...
83                  } catch ( InterruptedException e1 ) {
84  
85                  }
86                  tries++;
87              }
88          }
89  
90          if ( response == null ) {
91              log.warn( "Failed to get a response for " + url + " (original query=" + term + ")" );
92              return responsesFound;
93          }
94  
95          DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
96          DocumentBuilder builder = factory.newDocumentBuilder();
97          Document document = builder.parse( response );
98          NodeList nodes = document.getElementsByTagName( "annotation" );
99  
100         // for each response receive, populate the return objects
101         for ( int temp = 0; temp < nodes.getLength(); temp++ ) {
102 
103             Node nNode = nodes.item( temp );
104             Element eElement = ( Element ) nNode;
105 
106             // this is what was found with the annotator
107             String valueUri = eElement.getElementsByTagName( "id" ).item( 0 ).getTextContent();
108 
109             // populates all synonym given for that specific term
110             for ( int i = 0; i < eElement.getElementsByTagName( "annotations" ).getLength(); i++ ) {
111 
112                 Element infoE = ( Element ) eElement.getElementsByTagName( "annotations" ).item( i );
113 
114                 String matchType = infoE.getElementsByTagName( "matchType" ).item( 0 ).getTextContent();
115                 String txtMatched = infoE.getElementsByTagName( "text" ).item( 0 ).getTextContent();
116                 String ontologyUsed = findOntologyUsed( valueUri );
117 
118                 Integer from = new Integer( infoE.getElementsByTagName( "from" ).item( 0 ).getTextContent() );
119                 Integer to = new Integer( infoE.getElementsByTagName( "to" ).item( 0 ).getTextContent() );
120 
121                 AnnotatorResponse annotatorResponse = new AnnotatorResponse( valueUri, matchType, txtMatched, from, to,
122                     ontologyUsed, termClean );
123 
124                 responsesFound.add( annotatorResponse );
125             }
126         }
127 
128         return responsesFound;
129     }
130 
131     /**
132      * FIXME only knows about HP and DOID
133      *
134      * @param  url
135      * @return
136      */
137     private static String findOntologyUsed( String url ) {
138 
139         if ( url.indexOf( HP_ONTOLOGY ) != -1 ) {
140             return HP_ONTOLOGY;
141         } else if ( url.indexOf( DOID_ONTOLOGY ) != -1 ) {
142             return DOID_ONTOLOGY;
143         }
144 
145         return "UNKNOWN";
146 
147     }
148 
149     /**
150      * return the label associated with an conceptid. FIXME why are we doing things this way, it must be terribly slow
151      *
152      * @param  ontologyId what virtual ontology to use
153      * @param  identifier the identifier, knows about: OMIM, DOID, MESH
154      * @return the label for that term, example : ABCD syndrome
155      */
156     public static String findLabelForIdentifier( String ontologyId, String identifier ) {
157 
158         if ( StringUtils.isBlank( API_KEY ) ) {
159             throw new IllegalStateException( "NCBO ncbo.api.key needs to be configured" );
160         }
161 
162         // Examples
163         // http://data.bioontology.org/ontologies/DOID
164         // http://data.bioontology.org/ontologies/DOID/classes/http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FDOID_8986
165 
166         // http://data.bioontology.org/ontologies/MESH
167         // http://data.bioontology.org/ontologies/MESH/classes/http%3A%2F%2Fpurl.bioontology.org%2Fontology%2FMESH%2FC000624633
168 
169         // http://data.bioontology.org/ontologies/OMIM
170 
171         // http://data.bioontology.org/ontologies/OMIM/classes/http%3A%2F%2Fpurl.bioontology.org%2Fontology%2FOMIM%2F600374
172 
173         String url;
174 
175         // These Ontology identifiers could potentially be retrieved, using OntologyLookup
176         switch ( ontologyId ) {
177             case "OMIM":
178             case "MESH":
179                 url = "http://data.bioontology.org/ontologies/" + ontologyId
180                     + "/classes/http%3A%2F%2Fpurl.bioontology.org%2Fontology%2FMESH%2F"
181                     + identifier
182                     + "/?apikey=" + API_KEY + "&format=xml";
183                 break;
184             case "DOID":
185                 url = "http://data.bioontology.org/ontologies/" + ontologyId + "/classes/http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2F"
186                     + identifier
187                     + "/?apikey=" + API_KEY + "&format=xml";
188                 break;
189             default:
190                 throw new IllegalArgumentException( "Don't know how to deal with " + ontologyId );
191         }
192 
193         log.debug( url );
194 
195         for ( int i = 0; i < MAX_TRIES; i++ ) {
196             try ( InputStream response = new URL( url ).openStream() ) {
197                 return findLabel( response );
198             } catch ( ConnectException ce ) {
199                 try {
200                     Thread.sleep( 500 );
201                 } catch ( InterruptedException e ) {
202                 }
203             } catch ( Exception e ) {
204                 log.error( "Identifier: '" + identifier + "'" );
205                 log.error( ExceptionUtils.getStackTrace( e ) );
206             }
207         }
208         return null;
209     }
210 
211     /**
212      * using the response return the label associated with the request
213      */
214     private static String findLabel( InputStream response ) throws Exception {
215         DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
216         DocumentBuilder builder = factory.newDocumentBuilder();
217         try ( InputStream content = response ) {
218             Document document = builder.parse( content );
219             NodeList nodes = document.getElementsByTagName( "prefLabel" );
220             if ( nodes == null ) {
221                 log.debug( "No definition found" );
222                 return null;
223             }
224             String labelName = ( ( Element ) nodes.item( 0 ) ).getTextContent();
225 
226             return labelName;
227         }
228     }
229 
230     // this is an attempt to clean up the string from characters we dont want
231     public static String removeSpecialCharacters( String txt ) {
232 
233         String simpleTxt = txt.trim();
234 
235         // remove txt between ( and )
236         int index1 = simpleTxt.indexOf( "(" );
237         int index2 = simpleTxt.indexOf( ")" );
238         if ( index1 != -1 && index2 != -1 ) {
239             simpleTxt = simpleTxt.substring( 0, index1 ) + simpleTxt.substring( index2 + 1, simpleTxt.length() );
240         }
241 
242         // what to keep
243         Pattern pt = Pattern.compile( "[^\\w\\s-,]+" );
244         Matcher match = pt.matcher( simpleTxt );
245         while ( match.find() ) {
246             String s = match.group();
247             simpleTxt = simpleTxt.replaceAll( "\\" + s, "" );
248         }
249 
250         return simpleTxt.trim().replaceAll( "\\s+", "+" );
251     }
252 
253 }