1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 package ubic.basecode.ontology.jena;
20
21 import com.hp.hpl.jena.datatypes.DatatypeFormatException;
22 import com.hp.hpl.jena.datatypes.xsd.XSDDateTime;
23 import com.hp.hpl.jena.ontology.OntModel;
24 import com.hp.hpl.jena.ontology.OntResource;
25 import com.hp.hpl.jena.rdf.model.*;
26 import com.hp.hpl.jena.shared.JenaException;
27 import com.hp.hpl.jena.util.iterator.ExtendedIterator;
28 import com.hp.hpl.jena.vocabulary.RDFS;
29 import org.apache.commons.lang3.StringUtils;
30 import org.apache.commons.lang3.time.StopWatch;
31 import org.apache.lucene.analysis.Analyzer;
32 import org.apache.lucene.analysis.en.EnglishAnalyzer;
33 import org.apache.lucene.analysis.standard.StandardAnalyzer;
34 import org.apache.lucene.document.Document;
35 import org.apache.lucene.document.Field;
36 import org.apache.lucene.document.Fieldable;
37 import org.apache.lucene.document.NumericField;
38 import org.apache.lucene.index.IndexReader;
39 import org.apache.lucene.index.IndexWriter;
40 import org.apache.lucene.index.IndexWriterConfig;
41 import org.apache.lucene.index.MultiReader;
42 import org.apache.lucene.queryParser.MultiFieldQueryParser;
43 import org.apache.lucene.queryParser.ParseException;
44 import org.apache.lucene.search.*;
45 import org.apache.lucene.store.Directory;
46 import org.apache.lucene.store.FSDirectory;
47 import org.apache.lucene.util.Version;
48 import org.slf4j.Logger;
49 import org.slf4j.LoggerFactory;
50 import ubic.basecode.ontology.search.OntologySearchException;
51 import ubic.basecode.util.Configuration;
52
53 import javax.annotation.Nullable;
54 import java.io.IOException;
55 import java.nio.file.Path;
56 import java.nio.file.Paths;
57 import java.util.*;
58 import java.util.stream.Collectors;
59 import java.util.stream.Stream;
60
61 import static ubic.basecode.ontology.jena.JenaUtils.as;
62
63
64
65
66
67
68 class OntologyIndexer {
69
70 private static final Logger log = LoggerFactory.getLogger( OntologyIndexer.class );
71
72
73
74
75 private static final String
76 ID_FIELD = "_ID",
77 LOCAL_NAME_FIELD = "_LOCAL_NAME",
78 IS_CLASS_FIELD = "_IS_CLASS",
79 IS_INDIVIDUAL_FIELD = "_IS_INDIVIDUAL";
80
81 public static class IndexableProperty {
82 private final Property property;
83 private final boolean analyzed;
84
85 public IndexableProperty( Property property, boolean analyzed ) {
86 this.property = property;
87 this.analyzed = analyzed;
88 }
89
90 public Property getProperty() {
91 return property;
92 }
93
94 public boolean isAnalyzed() {
95 return analyzed;
96 }
97 }
98
/**
 * Properties indexed when the caller does not supply its own collection. Every default property is analyzed.
 */
public static final Collection<IndexableProperty> DEFAULT_INDEXABLE_PROPERTIES;

static {
    Property[] analyzedByDefault = {
            RDFS.label,
            OBO.id,
            OBO.hasDbXref,
            OBO.hasSynonym,
            OBO.hasExactSynonym,
            OBO.hasBroadSynonym,
            OBO.hasNarrowSynonym,
            OBO.hasRelatedSynonym,
            IAO.alternativeLabel
    };
    Collection<IndexableProperty> defaults = new HashSet<>();
    for ( Property defaultProperty : analyzedByDefault ) {
        defaults.add( new IndexableProperty( defaultProperty, true ) );
    }
    DEFAULT_INDEXABLE_PROPERTIES = defaults;
}
113
114
115
116
117 @Nullable
118 public static SearchIndex getSubjectIndex( String name, Set<String> excludedFromStemming ) {
119 return getSubjectIndex( name, DEFAULT_INDEXABLE_PROPERTIES, excludedFromStemming );
120 }
121
122
123
124
125
126
127 @Nullable
128 public static SearchIndex getSubjectIndex( String name, Collection<IndexableProperty> indexableProperties, Set<String> excludedFromStemming ) {
129 log.debug( "Loading index for {}...", name );
130 try {
131
132 FSDirectory directory = FSDirectory.open( getIndexPath( name ).toFile() );
133 FSDirectory directoryStd = FSDirectory.open( getIndexPath( name + ".std" ).toFile() );
134 if ( !IndexReader.indexExists( directory ) ) {
135 return null;
136 }
137 if ( !IndexReader.indexExists( directoryStd ) ) {
138 return null;
139 }
140 return openIndex( directory, directoryStd, indexableProperties, excludedFromStemming );
141 } catch ( IOException e ) {
142 log.warn( "Index for {} could not be opened.", name, e );
143 return null;
144 }
145 }
146
147
148
149
150 public static SearchIndex indexOntology( String name, OntModel model, Set<String> excludedFromStemming, boolean force ) throws JenaException, IOException {
151 return indexOntology( name, model, DEFAULT_INDEXABLE_PROPERTIES, excludedFromStemming, force );
152 }
153
154
155
156
157
158 public static SearchIndex indexOntology( String name, OntModel model, Collection<IndexableProperty> indexableProperties, Set<String> excludedFromStemming, boolean force ) throws JenaException, IOException {
159 if ( force ) {
160 return index( name, model, indexableProperties, excludedFromStemming );
161 }
162 SearchIndex index = getSubjectIndex( name, excludedFromStemming );
163 if ( index == null ) {
164 log.warn( "Index not found, or there was an error, re-indexing {}...", name );
165 return index( name, model, indexableProperties, excludedFromStemming );
166 }
167 log.debug( "A valid index for {} already exists, using", name );
168 return index;
169 }
170
171 private static Path getIndexPath( String name ) {
172 if ( StringUtils.isBlank( name ) ) {
173 throw new IllegalArgumentException( "The ontology must have a suitable name for being indexed." );
174 }
175 String ontologyDir = Configuration.getString( "ontology.index.dir" );
176 if ( StringUtils.isBlank( ontologyDir ) ) {
177 return Paths.get( System.getProperty( "java.io.tmpdir" ), "searchIndices", "ontology", name );
178 }
179 return Paths.get( ontologyDir, "ontology", name );
180 }
181
182
183
184
185 private static SearchIndex index( String name, OntModel model, Collection<IndexableProperty> indexableProperties, Set<String> excludedFromStemming ) throws JenaException, IOException {
186 Directory dir = index( name, model, new EnglishAnalyzer( Version.LUCENE_36, EnglishAnalyzer.getDefaultStopSet(), excludedFromStemming ), getIndexPath( name ), indexableProperties );
187
188 Directory dirStd = index( name, model, new StandardAnalyzer( Version.LUCENE_36 ), getIndexPath( name + ".std" ), indexableProperties );
189 return openIndex( dir, dirStd, indexableProperties, excludedFromStemming );
190 }
191
192 private static Directory index( String name, OntModel model, Analyzer analyzer, Path indexDir, Collection<IndexableProperty> indexableProperties ) throws IOException {
193 StopWatch timer = StopWatch.createStarted();
194 FSDirectory dir = FSDirectory.open( indexDir.toFile() );
195 log.debug( "Indexing {} to: {}...", name, indexDir );
196 IndexWriterConfig config = new IndexWriterConfig( Version.LUCENE_36, analyzer );
197 try ( IndexWriter indexWriter = new IndexWriter( dir, config ) ) {
198 indexWriter.deleteAll();
199 assert 0 == indexWriter.numDocs();
200 Map<String, IndexableProperty> indexablePropertiesByField = indexableProperties.stream()
201 .collect( Collectors.toMap( p -> p.getProperty().getURI(), p -> p ) );
202 ExtendedIterator<Resource> subjects = model.listSubjects()
203 .filterDrop( new BnodeFilter<>() );
204 while ( subjects.hasNext() ) {
205 Resource subject = subjects.next();
206 String id = subject.getURI();
207 Document doc = new Document();
208 doc.add( new Field( ID_FIELD, id, Field.Store.YES, Field.Index.NOT_ANALYZED ) );
209 doc.add( new Field( LOCAL_NAME_FIELD, subject.getLocalName(), Field.Store.NO, Field.Index.NOT_ANALYZED ) );
210 boolean isClass = as( subject, OntResource.class ).map( OntResource::isClass ).orElse( false );
211 boolean isIndividual = as( subject, OntResource.class ).map( OntResource::isIndividual ).orElse( false );
212 doc.add( new NumericField( IS_CLASS_FIELD ).setIntValue( isClass ? 1 : 0 ) );
213 doc.add( new NumericField( IS_INDIVIDUAL_FIELD ).setIntValue( isIndividual ? 1 : 0 ) );
214 for ( IndexableProperty prop : indexableProperties ) {
215 StmtIterator listStatements = subject.listProperties( prop.property );
216 while ( listStatements.hasNext() ) {
217 Statement s = listStatements.next();
218 String field = s.getPredicate().getURI();
219 Fieldable f;
220 if ( s.getObject().isLiteral() ) {
221 Literal l = s.getObject().asLiteral();
222 Object v;
223 try {
224 v = l.getValue();
225 } catch ( DatatypeFormatException e ) {
226 log.warn( "Invalid datatype for literal: {}", l, e );
227 continue;
228 }
229 if ( v instanceof String ) {
230 f = new Field( field, ( String ) v, Field.Store.NO, indexablePropertiesByField.get( field ).isAnalyzed() ? Field.Index.ANALYZED : Field.Index.NOT_ANALYZED );
231 } else if ( v instanceof Number ) {
232 NumericField nf = new NumericField( field );
233 if ( v instanceof Integer ) {
234 nf.setIntValue( ( Integer ) v );
235 } else if ( v instanceof Long ) {
236 nf.setLongValue( ( Long ) v );
237 } else if ( v instanceof Float ) {
238 nf.setFloatValue( ( Float ) v );
239 } else if ( v instanceof Double ) {
240 nf.setDoubleValue( ( Double ) v );
241 } else {
242 log.warn( "Skipping numeric literal of unsupported type: {}", l );
243 continue;
244 }
245 f = nf;
246 } else if ( v instanceof XSDDateTime ) {
247 f = new NumericField( field )
248 .setLongValue( ( ( XSDDateTime ) v ).asCalendar().getTime().getTime() );
249 } else if ( v instanceof Boolean ) {
250 f = new NumericField( field ).setIntValue( Boolean.TRUE.equals( v ) ? 1 : 0 );
251 } else {
252 log.warn( "Skipping literal of unsupported type: {}", l );
253 continue;
254 }
255 } else if ( s.getObject().isURIResource() ) {
256
257 f = new Field( field, s.getObject().asResource().getURI(), Field.Store.NO, Field.Index.NOT_ANALYZED );
258 } else {
259
260 continue;
261 }
262 doc.add( f );
263 }
264 }
265 indexWriter.addDocument( doc );
266 }
267 indexWriter.commit();
268 log.debug( "Done indexing {} subjects of {} in {} s.", indexWriter.numDocs(), name, String.format( "%.2f", timer.getTime() / 1000.0 ) );
269 }
270 return dir;
271 }
272
273 private static SearchIndex openIndex( Directory dir, Directory dirStd, Collection<IndexableProperty> indexableProperties, Set<String> excludedFromStemming ) throws IOException {
274 String[] searchableFields = Stream.concat( Stream.of( ID_FIELD, LOCAL_NAME_FIELD ), indexableProperties.stream().map( p -> p.property ).map( Resource::getURI ) )
275 .distinct()
276 .toArray( String[]::new );
277 return new LuceneSearchIndex( searchableFields, new MultiReader( IndexReader.open( dir ), IndexReader.open( dirStd ) ), new EnglishAnalyzer( Version.LUCENE_36, EnglishAnalyzer.getDefaultStopSet(), excludedFromStemming ) );
278 }
279
280 private static class LuceneSearchIndex implements SearchIndex {
281
282 private static final Logger log = LoggerFactory.getLogger( LuceneSearchIndex.class );
283
284 private final String[] searchableFields;
285 private final IndexReader index;
286 private final Analyzer analyzer;
287
288 public LuceneSearchIndex( String[] searchableFields, IndexReader index, Analyzer analyzer ) {
289 this.searchableFields = searchableFields;
290 this.index = index;
291 this.analyzer = analyzer;
292 }
293
294 @Override
295 public List<JenaSearchResult> search( OntModel model, String queryString, int maxResults ) throws OntologySearchException {
296 return search( model, queryString, null, maxResults );
297 }
298
299 @Override
300 public List<JenaSearchResult> searchClasses( OntModel model, String queryString, int maxResults ) throws OntologySearchException {
301 return search( model, queryString, NumericRangeFilter.newIntRange( IS_CLASS_FIELD, 1, 1, true, true ), maxResults );
302 }
303
304 @Override
305 public List<JenaSearchResult> searchIndividuals( OntModel model, String queryString, int maxResults ) throws OntologySearchException {
306 return search( model, queryString, NumericRangeFilter.newIntRange( IS_INDIVIDUAL_FIELD, 1, 1, true, true ), maxResults );
307 }
308
309 private List<JenaSearchResult> search( OntModel model, String queryString, @Nullable Filter filter, int maxResults ) throws OntologySearchException {
310 if ( StringUtils.isBlank( queryString ) ) {
311 throw new IllegalArgumentException( "Query cannot be blank" );
312 }
313 StopWatch timer = StopWatch.createStarted();
314 try {
315 Query query = new MultiFieldQueryParser( Version.LUCENE_36, searchableFields, analyzer ).parse( queryString );
316
317
318 TopDocs hits = new IndexSearcher( index ).search( query, filter, maxResults * 2 );
319 Set<String> seenIds = new HashSet<>( hits.totalHits / 2 );
320 List<JenaSearchResult> resources = new ArrayList<>( hits.totalHits / 2 );
321 for ( int i = 0; i < hits.scoreDocs.length; i++ ) {
322 Document doc = index.document( hits.scoreDocs[i].doc );
323 String id = doc.get( ID_FIELD );
324 if ( seenIds.contains( id ) ) {
325 continue;
326 }
327 Resource res = model.getResource( id );
328 resources.add( new JenaSearchResult( res, hits.scoreDocs[i].score ) );
329 seenIds.add( id );
330 if ( seenIds.size() >= maxResults ) {
331 break;
332 }
333 }
334 return resources;
335 } catch ( ParseException e ) {
336 throw new OntologySearchException( "Failed to parse search query.", queryString, e );
337 } catch ( IOException e ) {
338 throw new OntologySearchException( "An I/O error occured while searching.", queryString, e );
339 } finally {
340 timer.stop();
341 if ( timer.getTime() > 100 ) {
342 log.warn( "Ontology resource search for: {} took {} ms.", queryString, timer.getTime() );
343 }
344 }
345 }
346
347 @Override
348 public void close() throws IOException {
349 index.close();
350 }
351 }
352
353 }