Merge Data from a SDF

This script merges data from a SD file into an IJC entity, based on a common key value. If that particular value is not found, a new row containing information is inserted.

/** Merge data from a SD file into an IJC entity.
*
* Usage:
* 1. load the first SD file into IJC using the standard import(script is configured for file 'Building_blocks_GBP_fixed.sdf')
* 2. add a text field named MpBp
* 3. edit the variables in the 'edit these settings' section (script is configured for file '2004 Building Blocks Database.sdf')
* 4. execute the script
*
* Note: the files used for the pre-configured settings can be found here: http://shared.chemaxon.com/projects/ijc/data/
* Note: the script will run much faster if there is an index on the column for the merge field (casno in this case).
* Note: there is special handling of the values for the CASno field from the SD file because of quirks with the data
* from this field. If you are using this for your own data then you will need to remove this custom processing.
*
* @author Tim Dudgeon ([email protected])
*/

import chemaxon.formats.MolImporter
import chemaxon.struc.Molecule
import com.im.df.api.dml.*
import com.im.commons.progress.*
import com.im.df.api.support.*

// ---------- edit these settings ----------------------------------------------------
String file = 'C:/data/structures/KeyOrganics/2004 Building Blocks Database.sdf' // the file to load
String structureFieldName = 'Structure' // the name fo the structure field
String mergePropName = 'CASno' // the name of the merge field in the file
String mergeFieldName = 'casno' // the name of the merge property in the database
// next two are a list of maps. key is the name in the file and value is the corresponding name of the field in the database
Map insertValFieldNames = ['ID':'code', 'Purity':'purity'] // fields whose values are added only when inserting new row
Map updateAndInsertValFieldNames = ['Mp/Bp':'MpBp'] // fields whose values are updated or inserted
// ------------probably no need to edit anything below here ---------------------------


MolImporter importer = new MolImporter(file)
importer.grabbingEnabled = true
Molecule mol = new Molecule()

def parent = dataTree.rootVertex.entity
def edp = parent.schema.dataProvider.getEntityDataProvider(parent)
def structureField = parent.fields.items.find { it.name == structureFieldName }
def mergeField = parent.fields.items.find { it.name == mergeFieldName }

// find the fields - we need them later
Map insertFields = [:]
insertValFieldNames.each { k,v ->
def fld = parent.fields.items.find { it.name == v }
assert fld != null
insertFields[k] = fld
}

Map updateAndInsertFields = [:]
updateAndInsertValFieldNames.each { k,v ->
def fld = parent.fields.items.find { it.name == v }
assert fld != null
updateAndInsertFields[k] = fld
}

def lock = edp.lockable.obtainLock('updating data')
def env = EnvUtils.createDefaultEnvironmentRW(lock, 'updating data', true);

int i = 1
int updates = 0
int inserts = 0
try {
println "Reading file"
while (importer.read(mol)) {
String molStr = importer.grabbedMoleculeString
String mergeVal = MPropHandler.convertToString(mol.properties(), mergePropName);
// custom processing of CASno values
// strangely the cas numbers are surrounded by square brackets in this file so we need to remove them
if (mergeVal) {
mergeVal = mergeVal.substring(1, mergeVal.length() - 1)
}
Map insertVals = [:]
insertValFieldNames.each { k,v ->
insertVals[k] = mol.getProperty(k)
}

Map updateAndInsertVals = [:]
updateAndInsertValFieldNames.each { k,v ->
updateAndInsertVals[k] = mol.getProperty(k)
}

boolean update = false
DFTermExpression q = null
if (mergeVal) {

q = DFTermsFactory.createFieldOperatorValueExpr(
Operators.EQUALS, mergeField, null, mergeVal)

List ids = edp.queryForIds(q, SortDirective.EMPTY, env)
if (ids) {
update = true
updates++
} else {
inserts++
}
}
if (update) {
Map vals = [:]
// add the values
updateAndInsertFields.each { k,v ->
vals[v.id] = updateAndInsertVals[k]
}

DFUpdateDescription ud = DFUpdateDescription.create(parent, q, vals)
Map<DFUpdateDescription, DFUpdateResult> result = edp.update([ud], DFUndoConfig.OFF, env)
DFUpdateResult res = result[ud]
println "$i updated ${res.changedRowsCount} rows"
} else {
// new map and add structure
Map vals = [(structureField.id):molStr]
// add the values for fields that are for insert only
insertFields.each { k,v ->
vals[v.id] = insertVals[k]
}
// and add the ones that are for both insert and update
updateAndInsertFields.each { k,v ->
vals[v.id] = updateAndInsertVals[k]
}
edp.insert(vals, null, env)
println "$i inserted row"
}

i++
}
} finally {
importer.close()
env?.feedback.finish()
lock?.release()
}

println "Finished. $updates updates, $inserts inserts"

Versions: This script has been tested on IJC versions 5.6 and 5.7

Copyright © 1999-2012 ChemAxon Ltd.    All rights reserved.