Create a Diverse Subset

/** Select a diverse subset of structures from amongst the current results.
* The algorithm is pretty simplistic, randomly selecting a structure,
* checking if it is similar to one already in the pool, and if not then adding
* to the pool.
* Similarity scores are obtained from the results of an overlap analysis that
* must previously have been run.
* The results are written out to the console and can be copied, and then pasted as a new list.
*
* Steps:
* 1. Run an overlap analysis using the same structure entity as query and target (Tools -> Chemistry -> Overlap analysis).
* Specify the similarity threshold you want to use to exclude molecules.
* 2. Edit the parameters in the 'edit these settings' section
* 3. Run the script
*
* @author Tim Dudgeon ([email protected])
*/

import com.im.commons.progress.*

// ---------- edit these settings ----------------------------------------------------

// Number of diverse structures to generate
int setSize = 200
// Name of the field that holds the overlap analysis hits
String OVERLAP_FIELD = 'Overlap hits'

// ---------- probably no need to edit anything below here ---------------------------

// Resolve the entity at the root of the data tree and the fields needed for the selection.
def parent = dataTree.rootVertex.entity // root entity
def fldId = parent.idField // ID field
println "found ID field ${fldId.id}"

// Look up the overlap hits field by its name. find() returns null when no field
// matches, so fail here with an explanatory message rather than with a
// null-pointer error on the next line.
def fldOvrlp = parent.fields.items.find { it.name == OVERLAP_FIELD }
if (fldOvrlp == null) {
    throw new IllegalStateException('No field named \'' + OVERLAP_FIELD + '\' was found on the root entity. ' +
            'Run the overlap analysis first, or correct the OVERLAP_FIELD setting.')
}
println "found overlap hits field ${fldOvrlp.id}"

// ResultSet and VertexStates
def rs = parent.schema.dataProvider.getDefaultResultSet(dataTree, false, DFEnvironmentRO.DEV_NULL)
def parentVS = rs.getVertexState(dataTree.rootVertex)
// IDs exposed by the root vertex state; these are the candidates for the subset
def ids = parentVS.ids
// Call size() explicitly: property-style access on a Groovy collection is
// interpreted as a property lookup on each element, not as the collection size.
println "Found ${ids.size()} parent IDs to analyse"




// Build the diverse subset: repeatedly draw a random ID that has not been tested
// yet, and add it to the subset unless its overlap report mentions an ID that is
// already in the subset. Stops when the subset holds setSize IDs or when there
// are no untested IDs left.
def subset = new LinkedHashSet() // keeps the IDs in the order they were accepted
def rand = new Random()
// Work on a private copy so that removing tested IDs never modifies the list
// obtained from the vertex state.
def candidates = new ArrayList(ids)
rs.lockable.withLock('selecting diverse subset') { envRW ->
    while (candidates && subset.size() < setSize) {
        int pos = rand.nextInt(candidates.size())
        int id = candidates[pos]
        print "testing $id [$pos] ... "
        candidates.remove(pos) // remove by index so this ID is not drawn again

        try {
            def data = parentVS.getData([id], DFEnvironmentRO.DEV_NULL) // read data for this ID
            def sims = data[id][fldOvrlp.id] // get the similarity report
            def found = null
            if (sims) {
                // The pattern picks out hits written as "<id> (<score>)";
                // capture group 1 is the ID of the similar structure.
                def matcher = sims =~ /(\d+) \([\d\.]+\)/
                found = matcher.find { subset.contains(Integer.parseInt(it[1])) }
            }
            if (found) {
                // at least one reported hit is already in the subset
                println "excluded"
            } else {
                println "added"
                subset.add(id)
            }
        } catch (Exception exc) {
            // Best effort: report the failure and carry on with the next candidate
            println "ERROR Failed to load ID $id ${exc.toString()}"
        }
    }
}

println "\nSubset selection complete. Found ${subset.size()} diverse entries:\n\n" + subset.join('\n') + '\n'

Versions: This script has been tested on IJC version 6.0

Copyright © 1999-2012 ChemAxon Ltd.    All rights reserved.