/**
 * Select a diverse subset of structures from amongst the current results.
 *
 * The algorithm is pretty simplistic: it randomly picks a structure, checks
 * whether it is similar to one already in the pool, and if not adds it to the
 * pool. Similarity information is read from the results of an overlap analysis
 * that must previously have been run.
 * The results are written out to the console and can be copied, and then
 * pasted as a new list.
 *
 * Steps:
 * 1. Run an overlap analysis using the same structure entity as query and
 *    target (Tools -> Chemistry -> Overlap analysis).
 *    Specify the similarity threshold you want to use to exclude molecules.
 * 2. Edit the parameters in the 'edit these settings' section
 * 3. Run the script
 *
 * @author Tim Dudgeon ([email protected])
 */
import com.im.commons.progress.*

// ---------- edit these settings ----------------------------------------------------

def setSize = 200 // number of diverse structures to generate
def OVERLAP_FIELD = 'Overlap hits' // field name of the overlap analysis hits field

// ---------- probably no need to edit anything below here ---------------------------

def parent = dataTree.rootVertex.entity // root entity
def fldId = parent.idField // ID field
println "found ID field ${fldId.id}"

// overlap field: fail with a readable message instead of a NullPointerException
// when no field with the configured name exists
def fldOvrlp = parent.fields.items.find { it.name == OVERLAP_FIELD }
if (fldOvrlp == null) {
    throw new IllegalStateException("No field named '${OVERLAP_FIELD}' found - run the overlap analysis first or correct OVERLAP_FIELD")
}
println "found overlap hits field ${fldOvrlp.id}"

// ResultSet and VertexStates
def rs = parent.schema.dataProvider.getDefaultResultSet(dataTree, false, DFEnvironmentRO.DEV_NULL)
def parentVS = rs.getVertexState(dataTree.rootVertex)

// Work on a private copy: candidates are removed from this list as they are
// tested, and that must not touch the list owned by the vertex state.
def ids = new ArrayList(parentVS.ids)
// size() must be a method call here; 'ids.size' would be treated by Groovy as a
// property lookup on the list elements.
println "Found ${ids.size()} parent IDs to analyse"

def subset = new LinkedHashSet() // keeps the selected IDs in the order they were added
def rand = new Random()

rs.lockable.withLock('selecting diverse subset') { envRW ->
    // keep drawing random candidates until the subset is full or none are left
    while (ids && subset.size() < setSize) {
        int pos = rand.nextInt(ids.size())
        int id = ids[pos]
        print "testing $id [$pos] ... "
        ids.remove(pos) // pos is an int, so this removes by index; the ID is never tested again
        try {
            def data = parentVS.getData([id], DFEnvironmentRO.DEV_NULL) // read data for this ID
            def sims = data[id][fldOvrlp.id] // get the similarity report
            def found = null
            if (sims) {
                // Each hit in the report looks like "<digits> (<number>)"; the leading
                // digits are compared against the IDs already selected.
                // NOTE(review): the bracketed number is presumably the similarity score - confirm.
                def matcher = sims =~ /(\d+) \([\d\.]+\)/
                found = matcher.find { subset.contains(Integer.parseInt(it[1])) }
            }
            if (found) {
                println "excluded"
            } else {
                println "added"
                subset.add(id)
            }
        } catch (Exception exc) {
            // best effort: report the failure and carry on with the next candidate
            println "ERROR Failed to load ID $id ${exc.toString()}"
        }
    }
}

println "\nSubset selection complete. Found ${subset.size()} diverse entries:\n\n" + subset.join('\n') + '\n'
Versions: This script has been tested on IJC version 6.0.