Create a Diverse Subset

/** Select a diverse subset of structures from amongst the current results.
 * The algorithm is pretty simplistic, randomly selecting a structure,
 * checking if it is similar to one already in the pool, and if not then adding
 * to the pool.
 * Similarity scores are obtained from the results of an overlap analysis that
 * must previously have been run.
 * The results are written out to the console and can be copied, and then pasted as a new list.
 *
 * Steps:
 * 1. Run an overlap analysis using the same structure entity as query and target (Tools -> Chemistry -> Overlap analysis).
 *    Specify the similarity threshold you want to use to exclude molecules.
 * 2. Edit the parameters in the 'edit these settings' section
 * 3. Run the script
 *
 * @author Tim Dudgeon ([email protected])
 */

import com.im.commons.progress.*

// ---------- edit these settings ----------------------------------------------------

// Target size of the diverse subset: selection stops once this many structures are chosen.
int setSize = 200
// Name of the field that holds the overlap analysis hits.
String OVERLAP_FIELD = 'Overlap hits'

// ---------- probably no need to edit anything below here ---------------------------

// Resolve the root entity of the data tree and the fields this script reads.
def parent = dataTree.rootVertex.entity // root entity
def fldId = parent.idField // ID field
println "found ID field ${fldId.id}"
// overlap field, looked up by its name
def fldOvrlp = parent.fields.items.find { it.name == OVERLAP_FIELD }
if (fldOvrlp == null) {
    // Fail with an actionable message rather than a NullPointerException on the println below.
    throw new IllegalStateException(("No field named '${OVERLAP_FIELD}' was found on the root entity. " +
            "Run the overlap analysis first, or correct OVERLAP_FIELD.").toString())
}
println "found overlap hits field ${fldOvrlp.id}"

// ResultSet and VertexStates
def rs = parent.schema.dataProvider.getDefaultResultSet(dataTree, false, DFEnvironmentRO.DEV_NULL)
def parentVS = rs.getVertexState(dataTree.rootVertex)
def ids = parentVS.ids
// size() is called explicitly: the property form "$ids.size" is resolved by Groovy as a
// per-element property lookup when ids is a plain java.util collection.
println "Found ${ids.size()} parent IDs to analyse"




// Randomly draw IDs without replacement. An ID joins the subset only when none of the IDs
// listed in its overlap report is already in the subset.
def subset = new LinkedHashSet() // keeps the order in which IDs were accepted
def rand = new Random()
// Draw from a private copy so the list obtained from the vertex state is never mutated.
def candidates = new ArrayList(ids)
rs.lockable.withLock('selecting diverse subset') { envRW ->
    while (candidates && subset.size() < setSize) {
        int pos = rand.nextInt(candidates.size())
        int id = candidates[pos]
        print "testing $id [$pos] ... "
        candidates.remove(pos) // remove from the candidates so we don't look for it again

        try {
            def data = parentVS.getData([id], DFEnvironmentRO.DEV_NULL) // read data for this ID
            def sims = data[id][fldOvrlp.id]  // get the similarity report
            def found = null
            if (sims) {
                // The pattern matches "<digits> (<number>)"; group 1 is parsed as the ID of a hit.
                // Declared with def so the matcher stays local instead of leaking into the script binding.
                def matcher = sims =~ /(\d+) \([\d\.]+\)/
                // First hit whose ID is already in the subset, or null when there is none.
                found = matcher.find { subset.contains(Integer.parseInt(it[1])) }
            }
            if (found) {
                println "excluded"
            } else {
                println "added"
                subset.add(id)
            }
        } catch (Exception exc) {
            // Best effort: report the failure and carry on with the remaining candidates.
            println "ERROR Failed to load ID $id ${exc.toString()}"
        }
    }
}

// Print the selected IDs, one per line, after a summary line giving the subset size.
def idLines = subset.join('\n')
println "\nSubset selection complete. Found ${subset.size()} diverse entrees:\n\n${idLines}\n"

Versions: This script has been tested on IJC version 6.0.



Copyright © 1999-2012 ChemAxon Ltd.    All rights reserved.