View Javadoc
1   /*
2    * The basecode project
3    *
4    * Copyright (c) 2007-2019 University of British Columbia
5    *
6    * Licensed under the Apache License, Version 2.0 (the "License");
7    * you may not use this file except in compliance with the License.
8    * You may obtain a copy of the License at
9    *
10   *       http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   *
18   */
19  package ubic.basecode.ontology.jena;
20  
21  import com.hp.hpl.jena.datatypes.DatatypeFormatException;
22  import com.hp.hpl.jena.datatypes.xsd.XSDDateTime;
23  import com.hp.hpl.jena.ontology.OntModel;
24  import com.hp.hpl.jena.ontology.OntResource;
25  import com.hp.hpl.jena.rdf.model.*;
26  import com.hp.hpl.jena.shared.JenaException;
27  import com.hp.hpl.jena.util.iterator.ExtendedIterator;
28  import com.hp.hpl.jena.vocabulary.RDFS;
29  import org.apache.commons.lang3.StringUtils;
30  import org.apache.commons.lang3.time.StopWatch;
31  import org.apache.lucene.analysis.Analyzer;
32  import org.apache.lucene.analysis.en.EnglishAnalyzer;
33  import org.apache.lucene.analysis.standard.StandardAnalyzer;
34  import org.apache.lucene.document.Document;
35  import org.apache.lucene.document.Field;
36  import org.apache.lucene.document.Fieldable;
37  import org.apache.lucene.document.NumericField;
38  import org.apache.lucene.index.IndexReader;
39  import org.apache.lucene.index.IndexWriter;
40  import org.apache.lucene.index.IndexWriterConfig;
41  import org.apache.lucene.index.MultiReader;
42  import org.apache.lucene.queryParser.MultiFieldQueryParser;
43  import org.apache.lucene.queryParser.ParseException;
44  import org.apache.lucene.search.*;
45  import org.apache.lucene.store.Directory;
46  import org.apache.lucene.store.FSDirectory;
47  import org.apache.lucene.util.Version;
48  import org.slf4j.Logger;
49  import org.slf4j.LoggerFactory;
50  import ubic.basecode.ontology.search.OntologySearchException;
51  import ubic.basecode.util.Configuration;
52  
53  import javax.annotation.Nullable;
54  import java.io.IOException;
55  import java.nio.file.Path;
56  import java.nio.file.Paths;
57  import java.util.*;
58  import java.util.stream.Collectors;
59  import java.util.stream.Stream;
60  
61  import static ubic.basecode.ontology.jena.JenaUtils.as;
62  
63  /**
64   * A Lucene-based ontology indexer.
65   *
66   * @author pavlidis
67   */
68  class OntologyIndexer {
69  
70      private static final Logger log = LoggerFactory.getLogger( OntologyIndexer.class );
71  
72      /**
73       * Those are build-in fields that are always indexed.
74       */
75      private static final String
76          ID_FIELD = "_ID",
77          LOCAL_NAME_FIELD = "_LOCAL_NAME",
78          IS_CLASS_FIELD = "_IS_CLASS",
79          IS_INDIVIDUAL_FIELD = "_IS_INDIVIDUAL";
80  
81      public static class IndexableProperty {
82          private final Property property;
83          private final boolean analyzed;
84  
85          public IndexableProperty( Property property, boolean analyzed ) {
86              this.property = property;
87              this.analyzed = analyzed;
88          }
89  
90          public Property getProperty() {
91              return property;
92          }
93  
94          public boolean isAnalyzed() {
95              return analyzed;
96          }
97      }
98  
99      public static final Collection<IndexableProperty> DEFAULT_INDEXABLE_PROPERTIES;
100 
101     static {
102         DEFAULT_INDEXABLE_PROPERTIES = new HashSet<>();
103         DEFAULT_INDEXABLE_PROPERTIES.add( new IndexableProperty( RDFS.label, true ) );
104         DEFAULT_INDEXABLE_PROPERTIES.add( new IndexableProperty( OBO.id, true ) );
105         DEFAULT_INDEXABLE_PROPERTIES.add( new IndexableProperty( OBO.hasDbXref, true ) );
106         DEFAULT_INDEXABLE_PROPERTIES.add( new IndexableProperty( OBO.hasSynonym, true ) );
107         DEFAULT_INDEXABLE_PROPERTIES.add( new IndexableProperty( OBO.hasExactSynonym, true ) );
108         DEFAULT_INDEXABLE_PROPERTIES.add( new IndexableProperty( OBO.hasBroadSynonym, true ) );
109         DEFAULT_INDEXABLE_PROPERTIES.add( new IndexableProperty( OBO.hasNarrowSynonym, true ) );
110         DEFAULT_INDEXABLE_PROPERTIES.add( new IndexableProperty( OBO.hasRelatedSynonym, true ) );
111         DEFAULT_INDEXABLE_PROPERTIES.add( new IndexableProperty( IAO.alternativeLabel, true ) );
112     }
113 
114     /**
115      * Obtain an ontology index with the default indexable properties.
116      */
117     @Nullable
118     public static SearchIndex getSubjectIndex( String name, Set<String> excludedFromStemming ) {
119         return getSubjectIndex( name, DEFAULT_INDEXABLE_PROPERTIES, excludedFromStemming );
120     }
121 
122     /**
123      * Obtain an index with default analyzer (English), or null if no index is available.
124      * <p>
125      * <b>DOES not create the index if it doesn't exist.</b>
126      */
127     @Nullable
128     public static SearchIndex getSubjectIndex( String name, Collection<IndexableProperty> indexableProperties, Set<String> excludedFromStemming ) {
129         log.debug( "Loading index for {}...", name );
130         try {
131             // we do not put this in the try-with-open because we want these to *stay* open
132             FSDirectory directory = FSDirectory.open( getIndexPath( name ).toFile() );
133             FSDirectory directoryStd = FSDirectory.open( getIndexPath( name + ".std" ).toFile() );
134             if ( !IndexReader.indexExists( directory ) ) {
135                 return null;
136             }
137             if ( !IndexReader.indexExists( directoryStd ) ) {
138                 return null;
139             }
140             return openIndex( directory, directoryStd, indexableProperties, excludedFromStemming );
141         } catch ( IOException e ) {
142             log.warn( "Index for {} could not be opened.", name, e );
143             return null;
144         }
145     }
146 
147     /**
148      * Index an ontology with the default indexable properties.
149      */
150     public static SearchIndex indexOntology( String name, OntModel model, Set<String> excludedFromStemming, boolean force ) throws JenaException, IOException {
151         return indexOntology( name, model, DEFAULT_INDEXABLE_PROPERTIES, excludedFromStemming, force );
152     }
153 
154     /**
155      * Loads or creates an index from an existing OntModel. Any existing index will loaded unless force=true. It will be
156      * created if there isn't one already, or if force=true.
157      */
158     public static SearchIndex indexOntology( String name, OntModel model, Collection<IndexableProperty> indexableProperties, Set<String> excludedFromStemming, boolean force ) throws JenaException, IOException {
159         if ( force ) {
160             return index( name, model, indexableProperties, excludedFromStemming );
161         }
162         SearchIndex index = getSubjectIndex( name, excludedFromStemming );
163         if ( index == null ) {
164             log.warn( "Index not found, or there was an error, re-indexing {}...", name );
165             return index( name, model, indexableProperties, excludedFromStemming );
166         }
167         log.debug( "A valid index for {} already exists, using", name );
168         return index;
169     }
170 
171     private static Path getIndexPath( String name ) {
172         if ( StringUtils.isBlank( name ) ) {
173             throw new IllegalArgumentException( "The ontology must have a suitable name for being indexed." );
174         }
175         String ontologyDir = Configuration.getString( "ontology.index.dir" ); // e.g., /something/gemmaData/compass
176         if ( StringUtils.isBlank( ontologyDir ) ) {
177             return Paths.get( System.getProperty( "java.io.tmpdir" ), "searchIndices", "ontology", name );
178         }
179         return Paths.get( ontologyDir, "ontology", name );
180     }
181 
182     /**
183      * Create an on-disk index from an existing OntModel. Any existing index will be deleted/overwritten.
184      */
185     private static SearchIndex index( String name, OntModel model, Collection<IndexableProperty> indexableProperties, Set<String> excludedFromStemming ) throws JenaException, IOException {
186         Directory dir = index( name, model, new EnglishAnalyzer( Version.LUCENE_36, EnglishAnalyzer.getDefaultStopSet(), excludedFromStemming ), getIndexPath( name ), indexableProperties );
187         // we need to also analyze using the Standard analyzer, which doesn't do stemming and allows wildcard.
188         Directory dirStd = index( name, model, new StandardAnalyzer( Version.LUCENE_36 ), getIndexPath( name + ".std" ), indexableProperties );
189         return openIndex( dir, dirStd, indexableProperties, excludedFromStemming );
190     }
191 
192     private static Directory index( String name, OntModel model, Analyzer analyzer, Path indexDir, Collection<IndexableProperty> indexableProperties ) throws IOException {
193         StopWatch timer = StopWatch.createStarted();
194         FSDirectory dir = FSDirectory.open( indexDir.toFile() );
195         log.debug( "Indexing {} to: {}...", name, indexDir );
196         IndexWriterConfig config = new IndexWriterConfig( Version.LUCENE_36, analyzer );
197         try ( IndexWriter indexWriter = new IndexWriter( dir, config ) ) {
198             indexWriter.deleteAll(); // start with clean slate.
199             assert 0 == indexWriter.numDocs();
200             Map<String, IndexableProperty> indexablePropertiesByField = indexableProperties.stream()
201                 .collect( Collectors.toMap( p -> p.getProperty().getURI(), p -> p ) );
202             ExtendedIterator<Resource> subjects = model.listSubjects()
203                 .filterDrop( new BnodeFilter<>() );
204             while ( subjects.hasNext() ) {
205                 Resource subject = subjects.next();
206                 String id = subject.getURI();
207                 Document doc = new Document();
208                 doc.add( new Field( ID_FIELD, id, Field.Store.YES, Field.Index.NOT_ANALYZED ) );
209                 doc.add( new Field( LOCAL_NAME_FIELD, subject.getLocalName(), Field.Store.NO, Field.Index.NOT_ANALYZED ) );
210                 boolean isClass = as( subject, OntResource.class ).map( OntResource::isClass ).orElse( false );
211                 boolean isIndividual = as( subject, OntResource.class ).map( OntResource::isIndividual ).orElse( false );
212                 doc.add( new NumericField( IS_CLASS_FIELD ).setIntValue( isClass ? 1 : 0 ) );
213                 doc.add( new NumericField( IS_INDIVIDUAL_FIELD ).setIntValue( isIndividual ? 1 : 0 ) );
214                 for ( IndexableProperty prop : indexableProperties ) {
215                     StmtIterator listStatements = subject.listProperties( prop.property );
216                     while ( listStatements.hasNext() ) {
217                         Statement s = listStatements.next();
218                         String field = s.getPredicate().getURI();
219                         Fieldable f;
220                         if ( s.getObject().isLiteral() ) {
221                             Literal l = s.getObject().asLiteral();
222                             Object v;
223                             try {
224                                 v = l.getValue();
225                             } catch ( DatatypeFormatException e ) {
226                                 log.warn( "Invalid datatype for literal: {}", l, e );
227                                 continue;
228                             }
229                             if ( v instanceof String ) {
230                                 f = new Field( field, ( String ) v, Field.Store.NO, indexablePropertiesByField.get( field ).isAnalyzed() ? Field.Index.ANALYZED : Field.Index.NOT_ANALYZED );
231                             } else if ( v instanceof Number ) {
232                                 NumericField nf = new NumericField( field );
233                                 if ( v instanceof Integer ) {
234                                     nf.setIntValue( ( Integer ) v );
235                                 } else if ( v instanceof Long ) {
236                                     nf.setLongValue( ( Long ) v );
237                                 } else if ( v instanceof Float ) {
238                                     nf.setFloatValue( ( Float ) v );
239                                 } else if ( v instanceof Double ) {
240                                     nf.setDoubleValue( ( Double ) v );
241                                 } else {
242                                     log.warn( "Skipping numeric literal of unsupported type: {}", l );
243                                     continue;
244                                 }
245                                 f = nf;
246                             } else if ( v instanceof XSDDateTime ) {
247                                 f = new NumericField( field )
248                                     .setLongValue( ( ( XSDDateTime ) v ).asCalendar().getTime().getTime() );
249                             } else if ( v instanceof Boolean ) {
250                                 f = new NumericField( field ).setIntValue( Boolean.TRUE.equals( v ) ? 1 : 0 );
251                             } else {
252                                 log.warn( "Skipping literal of unsupported type: {}", l );
253                                 continue;
254                             }
255                         } else if ( s.getObject().isURIResource() ) {
256                             // index the URI
257                             f = new Field( field, s.getObject().asResource().getURI(), Field.Store.NO, Field.Index.NOT_ANALYZED );
258                         } else {
259                             // could be a blank node
260                             continue;
261                         }
262                         doc.add( f );
263                     }
264                 }
265                 indexWriter.addDocument( doc );
266             }
267             indexWriter.commit();
268             log.debug( "Done indexing {} subjects of {} in {} s.", indexWriter.numDocs(), name, String.format( "%.2f", timer.getTime() / 1000.0 ) );
269         }
270         return dir;
271     }
272 
273     private static SearchIndex openIndex( Directory dir, Directory dirStd, Collection<IndexableProperty> indexableProperties, Set<String> excludedFromStemming ) throws IOException {
274         String[] searchableFields = Stream.concat( Stream.of( ID_FIELD, LOCAL_NAME_FIELD ), indexableProperties.stream().map( p -> p.property ).map( Resource::getURI ) )
275             .distinct()
276             .toArray( String[]::new );
277         return new LuceneSearchIndex( searchableFields, new MultiReader( IndexReader.open( dir ), IndexReader.open( dirStd ) ), new EnglishAnalyzer( Version.LUCENE_36, EnglishAnalyzer.getDefaultStopSet(), excludedFromStemming ) );
278     }
279 
280     private static class LuceneSearchIndex implements SearchIndex {
281 
282         private static final Logger log = LoggerFactory.getLogger( LuceneSearchIndex.class );
283 
284         private final String[] searchableFields;
285         private final IndexReader index;
286         private final Analyzer analyzer;
287 
288         public LuceneSearchIndex( String[] searchableFields, IndexReader index, Analyzer analyzer ) {
289             this.searchableFields = searchableFields;
290             this.index = index;
291             this.analyzer = analyzer;
292         }
293 
294         @Override
295         public List<JenaSearchResult> search( OntModel model, String queryString, int maxResults ) throws OntologySearchException {
296             return search( model, queryString, null, maxResults );
297         }
298 
299         @Override
300         public List<JenaSearchResult> searchClasses( OntModel model, String queryString, int maxResults ) throws OntologySearchException {
301             return search( model, queryString, NumericRangeFilter.newIntRange( IS_CLASS_FIELD, 1, 1, true, true ), maxResults );
302         }
303 
304         @Override
305         public List<JenaSearchResult> searchIndividuals( OntModel model, String queryString, int maxResults ) throws OntologySearchException {
306             return search( model, queryString, NumericRangeFilter.newIntRange( IS_INDIVIDUAL_FIELD, 1, 1, true, true ), maxResults );
307         }
308 
309         private List<JenaSearchResult> search( OntModel model, String queryString, @Nullable Filter filter, int maxResults ) throws OntologySearchException {
310             if ( StringUtils.isBlank( queryString ) ) {
311                 throw new IllegalArgumentException( "Query cannot be blank" );
312             }
313             StopWatch timer = StopWatch.createStarted();
314             try {
315                 Query query = new MultiFieldQueryParser( Version.LUCENE_36, searchableFields, analyzer ).parse( queryString );
316                 // in general, results are found in both regular and std index, so we divide by 2 the initial capacity
317                 // we also have to double the number of hits to account for duplicates
318                 TopDocs hits = new IndexSearcher( index ).search( query, filter, maxResults * 2 );
319                 Set<String> seenIds = new HashSet<>( hits.totalHits / 2 );
320                 List<JenaSearchResult> resources = new ArrayList<>( hits.totalHits / 2 );
321                 for ( int i = 0; i < hits.scoreDocs.length; i++ ) {
322                     Document doc = index.document( hits.scoreDocs[i].doc );
323                     String id = doc.get( ID_FIELD );
324                     if ( seenIds.contains( id ) ) {
325                         continue;
326                     }
327                     Resource res = model.getResource( id );
328                     resources.add( new JenaSearchResult( res, hits.scoreDocs[i].score ) );
329                     seenIds.add( id );
330                     if ( seenIds.size() >= maxResults ) {
331                         break;
332                     }
333                 }
334                 return resources;
335             } catch ( ParseException e ) {
336                 throw new OntologySearchException( "Failed to parse search query.", queryString, e );
337             } catch ( IOException e ) {
338                 throw new OntologySearchException( "An I/O error occured while searching.", queryString, e );
339             } finally {
340                 timer.stop();
341                 if ( timer.getTime() > 100 ) {
342                     log.warn( "Ontology resource search for: {} took {} ms.", queryString, timer.getTime() );
343                 }
344             }
345         }
346 
347         @Override
348         public void close() throws IOException {
349             index.close();
350         }
351     }
352 
353 }