Merge Data from a SDF

This script merges data from a SD file into an IJC entity, based on a common key value. If that particular value is not found, a new row containing information is inserted.


/** Merge data from a SD file into an IJC entity.
 *
 * Usage:
 * 1. load the first SD file into IJC using the standard import(script is configured for file 'Building_blocks_GBP_fixed.sdf')
 * 2. add a text field named MpBp
 * 3. edit the variables in the 'edit these settings' section (script is configured for file '2004 Building Blocks Database.sdf')
 * 4. execute the script
 *
 * Note: the files used for the pre-configured settings can be found here: http://shared.chemaxon.com/projects/ijc/data/
 * Note: the script will run much faster if there is an index on the column for the merge field (casno in this case).
 * Note: there is special handling of the values for the CASno field from the SD file because of quirks with the data
 * from this field. If you are using this for your own data then you will need to remove this custom processing.
 *
 * @author Tim Dudgeon ([email protected])
 */

import chemaxon.formats.MolImporter
import chemaxon.struc.Molecule
import com.im.df.api.dml.*
import com.im.commons.progress.*
import com.im.df.api.support.*

// ---------- edit these settings ----------------------------------------------------
String file = 'C:/data/structures/KeyOrganics/2004 Building Blocks Database.sdf' // the file to load
String structureFieldName = 'Structure' // the name fo the structure field
String mergePropName = 'CASno' // the name of the merge field in the file
String mergeFieldName = 'casno' // the name of the merge property in the database
// next two are a list of maps. key is the name in the file and value is the corresponding name of the field in the database
Map insertValFieldNames = ['ID':'code', 'Purity':'purity'] // fields whose values are added only when inserting new row
Map updateAndInsertValFieldNames = ['Mp/Bp':'MpBp'] // fields whose values are updated or inserted
// ------------probably no need to edit anything below here ---------------------------


MolImporter importer = new MolImporter(file)
importer.grabbingEnabled = true
Molecule mol = new Molecule()

def parent = dataTree.rootVertex.entity
def edp = parent.schema.dataProvider.getEntityDataProvider(parent)
def structureField = parent.fields.items.find { it.name == structureFieldName }
def mergeField = parent.fields.items.find { it.name == mergeFieldName }

// find the fields - we need them later
Map insertFields = [:]
insertValFieldNames.each { k,v ->
    def fld = parent.fields.items.find { it.name == v }
    assert fld != null
    insertFields[k] = fld
}

Map updateAndInsertFields = [:]
updateAndInsertValFieldNames.each { k,v ->
    def fld = parent.fields.items.find { it.name == v }
    assert fld != null
    updateAndInsertFields[k] = fld
}

def lock = edp.lockable.obtainLock('updating data')
def env = EnvUtils.createDefaultEnvironmentRW(lock, 'updating data', true);

int i = 1
int updates = 0
int inserts = 0
try {
    println "Reading file"
    while (importer.read(mol)) {
        String molStr = importer.grabbedMoleculeString
        String mergeVal = MPropHandler.convertToString(mol.properties(), mergePropName);
        // custom processing of CASno values
        // strangely the cas numbers are surrounded by square brackets in this file so we need to remove them
        if (mergeVal) {
            mergeVal = mergeVal.substring(1, mergeVal.length() - 1)
        }
        Map insertVals = [:]
        insertValFieldNames.each { k,v ->
            insertVals[k] =  mol.getProperty(k)
        }

        Map updateAndInsertVals = [:]
        updateAndInsertValFieldNames.each { k,v ->
            updateAndInsertVals[k] =  mol.getProperty(k)
        }

        boolean update = false
        DFTermExpression q = null
        if (mergeVal) {

            q = DFTermsFactory.createFieldOperatorValueExpr(
                    Operators.EQUALS, mergeField, null, mergeVal)

            List ids = edp.queryForIds(q, SortDirective.EMPTY, env)
            if (ids) {
                update = true
                updates++
            } else {
                inserts++
            }
        }
        if (update) {
            Map vals = [:]
            // add the values
            updateAndInsertFields.each { k,v ->
                vals[v.id] = updateAndInsertVals[k]
            }

            DFUpdateDescription ud = DFUpdateDescription.create(parent, q, vals)
            Map<DFUpdateDescription, DFUpdateResult> result = edp.update([ud], DFUndoConfig.OFF, env)
            DFUpdateResult res = result[ud]
            println "$i updated ${res.changedRowsCount} rows"
        } else {
            // new map and add structure
            Map vals = [(structureField.id):molStr]
            // add the values for fields that are for insert only
            insertFields.each { k,v ->
                vals[v.id] = insertVals[k]
            }
            // and add the ones that are for both insert and update
            updateAndInsertFields.each { k,v ->
                vals[v.id] = updateAndInsertVals[k]
            }
            edp.insert(vals, null, env)
            println "$i inserted row"
        }

        i++
    }
} finally {
    importer.close()
    env?.feedback.finish()
    lock?.release()
}

println "Finished. $updates updates, $inserts inserts"


Versions: This script has been tested on IJC versions 5.6 and 5.7



Copyright © 1999-2012 ChemAxon Ltd.    All rights reserved.