Vladimír Višňovský · eb1b90dd
--- a/pipelineJupyter.ipynb

+ 1

− 0
+++ b/pipelineJupyter.ipynb

+ 1

− 0
 %% Cell type:markdown id: tags:

 ## 1. Definition of computational parameters

 %% Cell type:code id: tags:

 ``` python
 import os
 import sys
 import shutil
 import re
 import math
 import time
 import subprocess
 import nglview as nv
 import pytraj as pt

 #insert path to python scripts
 sys.path.insert(1, os.environ['BASE'] + "/modules")
 sys.path.insert(1, os.environ['BASE'] + "/modules/gmx_orca")

 from draw_2d import *
 from draw_3d import drawit
 from tqdm.notebook import tqdm

 from molvs import Standardizer

 from rdkit import Chem
 from rdkit.Chem import Draw
 from rdkit.Chem import SDWriter
 from rdkit.Chem import AllChem
 from rdkit.Chem import MolFromSmarts
 from rdkit.Chem import rdmolops
 from rdkit.Chem import rdchem

 from convert import convert_to_orca_methods
 from plot_graph import plot_landmarks
 from gmx_orca_run import gmx_run, orca_run, parallel_wait
 from replace import replace_text_for_embedding
 ```

 %% Cell type:code id: tags:

 ``` python
 #number of configurations to generate in basic energy minimization
 numc = 50

 #path to applications
 orca_job_check = "/home/base/modules/orcajobcheck.py"
 xyz_to_mol = "/home/base/modules/xyz2mol/xyz2mol.py"
 ```

 %% Cell type:markdown id: tags:

 ## 2. Molecule shape processing

 %% Cell type:code id: tags:

 ``` python
 #specify input of desired molecule in SMILES
 smiles_molecule = "CC1=C(C=C(C=C1)NC(=O)C2=CC=C(C=C2)C[NH+]3CCN(CC3)C)NC4=NC=CC(=N4)C5=CN=CC=C5"

 molecule = Chem.MolFromSmiles(smiles_molecule)

 render_svg(moltosvg(molecule))
 ```

 %% Cell type:code id: tags:

 ``` python
 with open("molekula.smi", "w") as smifile:
    smifile.write(smiles_molecule)

 s = Standardizer()
 molecule = s.standardize(molecule)
 molecule = Chem.AddHs(molecule)
 natoms = molecule.GetNumAtoms()
 charge = Chem.rdmolops.GetFormalCharge(molecule)

 render_svg(moltosvg(molecule))
 ```

 %% Cell type:code id: tags:

 ``` python
 #set lowest energy configuration

 #set default id of minimal energy conformer to -1
 minid = -1
 #set default minimal energy to highest possible float number
 minene = sys.float_info.max

 AllChem.EmbedMultipleConfs(molecule, clearConfs=True, numConfs=numc)
 #run 'MMFF94 force field' energy evaluation
 done = AllChem.MMFFOptimizeMoleculeConfs(molecule)

 for i in range(len(done)):
    if done[i][1]<minene:
        minene = done[i][1]
        minid = i
 print(f'Minimal energy: {minene}')

 writer = SDWriter("molekula.mol")
 writer.write(molecule, confId=minid)

 drawit(molecule)
 ```

 %% Cell type:code id: tags:

 ``` python
 #set pattern to detect torsion angles

 RotatableBond = Chem.MolFromSmarts('[!$([NH]!@C(=O))&!D1&!$(*#*)&!$([C;H3])&!$([O;H1])&!$([N;H3])]-&!@[!$([NH]!@C(=O))&!D1&!$(*#*)&!$([C;H3])&!$([O;H1])&!$([N;H3])]')
 rotatables = molecule.GetSubstructMatches(RotatableBond)
 print(rotatables)
 ```

 %% Cell type:code id: tags:

 ``` python
 #get numbers of atoms that form torsion angles

 torsions = []
 for rotatable in rotatables:
    pairs1 = []
    pairs2 = []
    for bond in molecule.GetBonds():
        if rotatable[0]==bond.GetBeginAtomIdx() and rotatable[1]!=bond.GetEndAtomIdx():
            pairs1.append([bond.GetBeginAtomIdx(),bond.GetEndAtomIdx()])
        if rotatable[1]==bond.GetBeginAtomIdx() and rotatable[0]!=bond.GetEndAtomIdx():
            pairs2.append([bond.GetBeginAtomIdx(),bond.GetEndAtomIdx()])
    torsions.append([pairs1[0][1],pairs1[0][0],pairs2[0][0],pairs2[0][1]])
 print(torsions)
 ```

 %% Cell type:markdown id: tags:

 ## 3. Preparation of environment and molecule

 %% Cell type:code id: tags:

 ``` python
 # prepare input files for energetic minimization

 netcharge = rdmolops.GetFormalCharge(molecule)
 !antechamber -i molekula.mol -fi mdl -o molekula.prepi -fo prepi -c bcc -nc {netcharge} && \
 parmchk2 -i molekula.prepi -f prepi -o molekula.frcmod && \
 tleap -f tleapin.txt && \
 acpype -p molekula.prmtop -x molekula.inpcrd
 ```

 %% Cell type:code id: tags:

 ``` python
 #fix ordering of atoms

 order_before = []
 order_after = []

 with open('sqm.pdb','r') as pdbfile:
    for atom in pdbfile.readlines():
        order_before.append(atom.split()[2])

 with open('MOL_GMX.gro','r') as grofile:
    for atom in grofile.readlines():
        if atom.startswith('    1  MOL'):
            order_after.append(atom.split()[2])

 torsions_new = []
 torsion_new = []
 for torsion in torsions:
    for i in torsion:
        torsion_new.append(order_after.index(order_before[i])+1)
    torsions_new.append(torsion_new)
    torsion_new = []
 torsions = torsions_new
 print(torsions)
 ```

 %% Cell type:code id: tags:

 ``` python
 !mkdir -p em
 with open("em/em.mdp", "w") as emfile:
    emfile.write("integrator          =  steep\n")
    emfile.write("nsteps              =  100000\n")
    emfile.write("emtol               =  0\n")
    emfile.write("emstep              =  0.1\n")
    emfile.write("nstcomm             =  1\n")
    emfile.write("nstxout             =  100\n")
    emfile.write("nstvout             =  100\n")
    emfile.write("nstfout             =  0\n")
    emfile.write("nstlog              =  100\n")
    emfile.write("nstenergy           =  100\n")
    emfile.write("nstlist             =  1\n")
    emfile.write("ns_type             =  grid\n")
    emfile.write("coulombtype         =  cut-off\n")
    emfile.write("rlist               =  1.4\n")
    emfile.write("rcoulomb            =  1.4\n")
    emfile.write("rvdw                =  1.4\n")
    emfile.write("energygrps          =  System\n")
    emfile.write("epsilon-r           =  80\n")
    emfile.write("\n")
 shutil.copy("MOL_GMX.gro", "em/")
 shutil.copy("MOL_GMX.top", "em/")
 ```

 %% Cell type:code id: tags:

 ``` python
 gmx_run("editconf -f MOL_GMX -o box -c -box 3 3 3", workdir="em")
 gmx_run("grompp -f em.mdp -c box -p MOL_GMX -o em1", workdir="em")
 gmx_run("mdrun -deffnm em1 -ntomp 2", workdir="em")
 ```

 %% Cell type:code id: tags:

 ``` python
 #prepare files for getting the molecule to dynamic state

 !mkdir -p md
 with open("md/md.mdp", "w") as mdfile:
    mdfile.write("integrator          = sd\n")
    mdfile.write("nsteps              = 100000\n")
    mdfile.write("dt                  = 0.001\n")
    mdfile.write("nstxout             = 1000\n")
    mdfile.write("nstvout             = 1000\n")
    mdfile.write("nstenergy           = 1000\n")
    mdfile.write("nstlog              = 1000\n")
    mdfile.write("continuation        = no\n")
    mdfile.write("constraints         = none\n")
    mdfile.write("cutoff-scheme       = Verlet\n")
    mdfile.write("ns_type             = grid\n")
    mdfile.write("nstlist             = 1\n")
    mdfile.write("rlist               = 1.4\n")
    mdfile.write("rcoulomb            = 1.4\n")
    mdfile.write("rvdw                = 1.4\n")
    mdfile.write("coulombtype         = cut-off\n")
    mdfile.write("tcoupl              = V-rescale\n")
    mdfile.write("tc-grps             = system\n")
    mdfile.write("tau_t               = 0.1\n")
    mdfile.write("ref_t               = 300\n")
    mdfile.write("pcoupl              = no\n")
    mdfile.write("pbc                 = xyz\n")
    mdfile.write("gen_vel             = yes\n")
    mdfile.write("epsilon-r           = 80\n")
    mdfile.write("\n")
 shutil.copy("em/em1.gro", "md/")
 shutil.copy("MOL_GMX.top", "md/")
 ```

 %% Cell type:code id: tags:

 ``` python
 gmx_run("grompp -f md.mdp -c em1 -p MOL_GMX -o md1", workdir="md")
 gmx_run("mdrun -deffnm md1 -ntomp 2", workdir="md")
 ```

 %% Cell type:code id: tags:

 ``` python
 #show trajectory of conformations
 #select group for trjconv evaluation
 #Group     0 (         System)
 #Group     1 (          Other)
 #Group     2 (            MOL)
 group = "0"

 gmx_run("trjconv -s md1.tpr -f md1.trr -o outTraj.pdb", workdir="md", groups=group)

 traj = pt.load('md/outTraj.pdb')
 view = nv.show_pytraj(traj)
 view
 ```

 %% Cell type:code id: tags:

 ``` python
 #fix periodic boundaries errors when the molecule jumps out of the box
 #select group for trjconv evaluation
 #Group     0 (         System)
 #Group     1 (          Other)
 #Group     2 (            MOL)
 group = "1"

 for i in tqdm(range(len(torsions))):
    fr = str(float(100-len(torsions)+i)-0.01)
    to = str(float(100-len(torsions)+i)+0.01)
    gmx_run(f"trjconv -pbc nojump -s md1 -f md1 -o frame{i}.gro -b {fr} -e {to}<<EOF\n0\nEOF", workdir="md", groups=group)
 ```

 %% Cell type:markdown id: tags:

 ## 4. Generation of representative configurations
 ### 4.1 Trajectory generation

 %% Cell type:code id: tags:

 ``` python
 !mkdir -p mtd
 with open("mtd/mtd.mdp", "w") as mtdfile:
    mtdfile.write("integrator          = sd\n")
    mtdfile.write("nsteps              = 10000000\n") #10 ns each replica
    mtdfile.write("dt                  = 0.001\n")
    mtdfile.write("nstxout             = 10000\n") # output coordinates every 10ps to *.trr
    mtdfile.write("nstvout             = 10000\n") # output velocities every 10ps to *.trr
    mtdfile.write("nstenergy           = 1000\n")
    mtdfile.write("nstlog              = 1000\n")
    mtdfile.write("continuation        = no\n")
    mtdfile.write("constraints         = none\n")
    mtdfile.write("cutoff-scheme       = Verlet\n")
    mtdfile.write("ns_type             = grid\n")
    mtdfile.write("nstlist             = 1\n")
    mtdfile.write("rlist               = 1.4\n")
    mtdfile.write("rcoulomb            = 1.4\n")
    mtdfile.write("rvdw                = 1.4\n")
    mtdfile.write("coulombtype         = cut-off\n")
    mtdfile.write("tcoupl              = V-rescale\n")
    mtdfile.write("tc-grps             = system\n")
    mtdfile.write("tau_t               = 0.1\n")
    mtdfile.write("ref_t               = 300\n")
    mtdfile.write("pcoupl              = no\n")
    mtdfile.write("pbc                 = xyz\n")
    mtdfile.write("gen_vel             = yes\n")
    mtdfile.write("epsilon-r           = 80\n")
    mtdfile.write("\n")
 ```

 %% Cell type:code id: tags:

 ``` python
 for i in range(len(torsions)):
    if not os.path.exists("mtd/w{}".format(i)):
        !mkdir -p mtd/w{i}
    with open("mtd/w{}/plumed.dat".format(i), "w") as plumeddat:
        plumeddat.write("RANDOM_EXCHANGES\n")
        plumeddat.write("WHOLEMOLECULES ENTITY0=1-{}\n".format(natoms))
        for j in range(len(torsions)):
            plumeddat.write("TORSION ATOMS={},{},{},{} LABEL=cv{}\n".format(torsions[j][0],torsions[j][1],torsions[j][2],torsions[j][3],j+1))
        plumeddat.write("METAD ARG=cv{} HEIGHT=0.5 SIGMA=0.3 PACE=1000 GRID_MIN=-pi GRID_MAX=pi BIASFACTOR=15 LABEL=be\n".format(i+1))
        cvs = ""
        for j in range(len(torsions)):
            cvs=cvs+"cv{},".format(j+1)
        cvs = cvs[:-1]
        plumeddat.write("PRINT ARG={} STRIDE=1000 FILE=COLVAR\n".format(cvs))
        plumeddat.write("PRINT ARG=be.bias STRIDE=1000 FILE=BIAS\n")

 shutil.copy("MOL_GMX.top", "mtd/")
 ```

 %% Cell type:code id: tags:

 ``` python
 for i in tqdm(range(len(torsions))):
    shutil.copy("md/frame{}.gro".format(i), "mtd/w{}/".format(i))
    gmx_run(f"grompp -f mtd.mdp -c w{i}/frame{i} -p MOL_GMX -o w{i}/mtd1", workdir="mtd", parallel=True)
 parallel_wait()
 ```

 %% Cell type:code id: tags:

 ``` python
 directories = ""
 for i in range(len(torsions)):
    directories = directories + "w{} ".format(i)

+#see mdrunlog in mtd directory for insight to mdrun simulation (e.g 'tail -f mdrunlog')
 gmx_run(f"mdrun -g mdrunlog -ntomp 1 -deffnm mtd1 -replex 500 -plumed plumed.dat -multidir {directories}",
        workdir="mtd",
        mpi_run=len(torsions))
 ```

 %% Cell type:markdown id: tags:

 ### 4.2 Configurations clustering

 %% Cell type:code id: tags:

 ``` python
 #select groups for cluster evaluation
 #Group     0 (         System)
 #Group     1 (          Other)
 #Group     2 (            MOL)
 #Group     3 (         Custom)
 groups = "30"
 !mkdir -p -m 757 clustering ; mkdir -p -m 757 clustering/outClustersPDB ; mkdir -p -m 757 clustering/outClustersXYZ ; mkdir -p -m 757 clustering/outClusters

 trajectories = ""
 for i in range(len(torsions)):
    trajectories = trajectories + f"mtd/w{i}/mtd1.trr "

 #concatinate trajectories
 gmx_run(f"trjcat -f {trajectories} -cat -o mtd/mtd1.trr")
 #make index file with non-hydrogen atoms, use 1 & ! a H* and q for input
 gmx_run("make_ndx -f md/md1.tpr -o mtd/index.ndx", make_ndx="1&!aH*")
 #include index file and select index group 3
 gmx_run("cluster -method gromos -f mtd/mtd1.trr -s mtd/w0/mtd1.tpr -n mtd/index.ndx -cutoff 0.15 -cl clustering/outCluster.pdb",
        groups=groups)


 #divide all clusters from gmx cluster to single clusters
 with open("clustering/outCluster.pdb") as input_cluster:
    i = 0
    outFile = open("clustering/outClustersPDB/outCluster{}.pdb".format(i), "w")
    for line in input_cluster:
        if line != "ENDMDL\n":
            outFile.write(line)
            continue
        outFile.write("ENDMDL\n")
        i += 1
        outFile.close()
        outFile = open("clustering/outClustersPDB/outCluster{}.pdb".format(i), "w")
 !rm clustering/outClustersPDB/outCluster{i}.pdb
 clusters_ids = [x for x in range(0, i)]


 #convert .pdb clusters to .xyz
 for pdb_cluster in tqdm(os.listdir("clustering/outClustersPDB/")):
    !babel -ipdb clustering/outClustersPDB/{pdb_cluster} -oxyz clustering/outClustersXYZ/{pdb_cluster.replace("pdb", "xyz")}
 ```

 %% Cell type:markdown id: tags:

 ### 4.3 Visualize landmarks

 %% Cell type:code id: tags:

 ``` python
 #apply periodic boundary conditions to mtd trajectory -> fitting -> no Hydrogen -> train parmtSNEcv -> compute embeddings

 !mkdir -p visualization/traj

 #Group     0 (         System)
 #Group     1 (          Other)
 #Group     2 (            MOL)
 #Group     3 (         Custom)
 #select group for periodic boundaries check output:
 group = "0"
 gmx_run("trjconv -f mtd/mtd1.trr -s mtd/w0/mtd1.tpr -pbc mol -o visualization/traj/mtd1_nopbc.xtc", groups=group)
 #select groups for fitting and output:
 groups = "00"
 gmx_run("trjconv -f visualization/traj/mtd1_nopbc.xtc -s clustering/outClustersPDB/outCluster0.pdb -fit rot+trans -o visualization/traj/mtd1_fit.xtc",
        groups=groups)
 #select group for no Hydrogen output:
 group = "3"
 gmx_run("trjconv -f visualization/traj/mtd1_fit.xtc -n mtd/index.ndx -o visualization/traj/mtd1_fit_noH.xtc",
        groups=group)
 #select groups for size of box and output:
 groups = "33"
 gmx_run("editconf -f clustering/outClustersPDB/outCluster0.pdb -n mtd/index.ndx -box 3 3 3 -c -o visualization/ref.pdb",
        groups=groups)


 #train parmtSNEcv
 !parmtSNEcv -i visualization/traj/mtd1_fit_noH.xtc -p visualization/ref.pdb -boxx 3 -boxy 3 -boxz 3 -dim 2 -layers 2 -o visualization/out.txt -plumed visualization/plumed.dat -epochs 2000


 #modify plumed.dat to compute embedding in every step and change name of file for *** mtd trajectory ***
 replace_text_for_embedding("visualization/plumed.dat", stride=(100, 1), file_name=("COLVAR", "2d_embedding"))
 #fix weights of atoms
 replace_text_for_embedding("visualization/ref.pdb", weight=(0.00, 1.00))
 #run as plumed
 gmx_run("driver --plumed visualization/plumed.dat --mf_xtc visualization/traj/mtd1_fit.xtc")

 #modify plumed.dat to compute embedding in every step and change name of file for *** representatives ***
 replace_text_for_embedding("visualization/plumed.dat", file_name=("2d_embedding", "landmarks"))
 #run as plumed
 gmx_run("driver --plumed visualization/plumed.dat --mf_pdb clustering/outCluster.pdb")
 ```

 %% Cell type:code id: tags:

 ``` python
 #visualize

 x = []
 y = []
 x1 = []
 y1 = []
 with open("2d_embedding", "r") as ifile:
    for line in ifile.readlines()[1:]:
        split_values = line.split()
        x.append(split_values[1])
        y.append(split_values[2])
 with open("landmarks", "r") as ifile:
    for line in ifile.readlines()[1:]:
        split_values = line.split()
        x1.append(split_values[1])
        y1.append(split_values[2])

 plot_landmarks(x, y, x1, y1).show()
 ```

 %% Cell type:markdown id: tags:

 ## 5. Accurate and innacurate energy evaluation
 ### 5.1 AM1 method

 %% Cell type:code id: tags:

 ``` python
 #fix indexing
 #orca indexes from zero

 orca_torsions = [[] for _ in range(len(torsions))]
 torsion_index = 0
 for torsion in torsions:
    for atom_index in torsion:
        orca_torsions[torsion_index].append(atom_index - 1)
    torsion_index += 1
 ```

 %% Cell type:code id: tags:

 ``` python
 #1. convert
 #orca method description - first line in orca method
 method_description = "!AM1 Opt"
 input_dir = "clustering/outClustersXYZ/"
 output_dir = "am1/input/"
 spin = 1

 !mkdir -p -m 757 am1 ; mkdir -p -m 757 am1/input ; mkdir -p -m 757 am1/output
 !mkdir -p -m 757 orca_output ; mkdir -p -m 757 orca_output/am1 ; mkdir -p -m 757 orca_output/bp86svp ; mkdir -p -m 757 orca_output/bp86tzvp

 convert_to_orca_methods(input_dir, output_dir, orca_torsions, method_description, -1, (charge, spin))
 ```

 %% Cell type:code id: tags:

 ``` python
 #2. compute
 #minimisation of conformations before quantum mechanics
 #see logs in orca_output directory...
 input_path = "am1/input/"
 output_path = "am1/output/"
 successfuly_converged = 0
 clusters_count = len(clusters_ids)

 for orca_method in tqdm(os.listdir(input_path)):
    if not orca_method.endswith(".inp"):
        continue
    orca_method_name = orca_method.replace(".inp", "")
    orca_method_number = orca_method_name[len("outCluster"):]
    output_dir_path = output_path + orca_method_name  + "/"
    if not os.path.exists(output_dir_path):
        !mkdir -p -m 757 {output_dir_path}
    shutil.copy(input_path + orca_method, output_dir_path)
    orca_run(orca_method, f"orca_output/am1/orca_output{orca_method_number}.out", workdir=output_dir_path, parallel=True)
 parallel_wait()
 ```

 %% Cell type:code id: tags:

 ``` python
 #3. check output
 #check output of orca methods

 for cluster_id in clusters_ids:
    with open(f"orca_output/am1/orca_output{cluster_id}.out") as iorca_output:
        iorca_output_text = iorca_output.read()
        if "****ORCA TERMINATED NORMALLY****" not in iorca_output_text:
            print(iorca_output_text)
            raise SystemExit(f"Error in am1 method of cluster no. {cluster_id}. You should not continue computation!")
 ```

 %% Cell type:markdown id: tags:

 ### 5.2 BP86 SVP method

 %% Cell type:code id: tags:

 ``` python
 #1. convert
 #orca method description - first line in orca method
 method_description = "!BP86 def2-SVP TightSCF Opt"
 output_dir = "bp86svp/input/"
 spin = 1
 required_cpus = 16

 #orca method description - first line in orca method
 method_description = "!BP86 def2-SVP TightSCF Opt"
 !mkdir -p -m 757 bp86svp ; mkdir -p -m 757 bp86svp/input ; mkdir -p -m 757 bp86svp/output

 for i in clusters_ids:
    input_dir = "am1/output/outCluster{}/".format(i)
    convert_to_orca_methods(input_dir, output_dir, orca_torsions, method_description, required_cpus, (charge, spin))
 ```

 %% Cell type:code id: tags:

 ``` python
 #2. compute
 #minimisation of conformations before quantum mechanics
 #see logs in orca_output directory...
 input_path = "bp86svp/input/"
 output_path = "bp86svp/output/"
 successfuly_converged = 0
 clusters_count = len(clusters_ids)

 for orca_method in tqdm(os.listdir(input_path)):
    if not orca_method.endswith(".inp"):
        continue
    orca_method_name = orca_method.replace(".inp", "")
    orca_method_number = orca_method_name[len("outCluster"):]
    output_dir_path = output_path + orca_method_name  + "/"
    if not os.path.exists(output_dir_path):
        !mkdir -p -m 757 {output_dir_path}
    shutil.copy(input_path + orca_method, output_dir_path)
    orca_run(orca_method, f"orca_output/bp86svp/orca_output{orca_method_number}.out", workdir=output_dir_path, parallel=True)
 parallel_wait()
 ```

 %% Cell type:code id: tags:

 ``` python
 #3. check output && discard non converging clusters
 for cluster_id in clusters_ids:
    with open(f"orca_output/bp86svp/orca_output{cluster_id}.out") as iorca_output:
            iorca_output_text = iorca_output.read()
            if "****ORCA TERMINATED NORMALLY****" not in iorca_output_text:
                !{orca_job_check} orca_output/bp86svp/orca_output{cluster_id}.out
                clusters_ids.remove(int(orca_method_number))
            else:
                successfuly_converged += 1

 print(f"** {successfuly_converged}/{clusters_count} successfuly converged - unconverged are not considered in next steps **")
 ```

 %% Cell type:markdown id: tags:

 ### 5.3 BP86 TZVP method

 %% Cell type:code id: tags:

 ``` python
 #1. convert
 #orca method description - first line in orca method
 method_description = "!BP86 def2-TZVP TightSCF Opt"
 output_dir = "bp86tzvp/input/"
 spin = 1
 required_cpus = 16

 #orca method description - first line in orca method
 method_description = "!BP86 def2-TZVP TightSCF Opt"
 !mkdir -p -m 757 bp86tzvp ; mkdir -p -m 757 bp86tzvp/input ; mkdir -p -m 757 bp86tzvp/output

 for i in clusters_ids:
    input_dir = "bp86svp/output/outCluster{}/".format(i)
    convert_to_orca_methods(input_dir, output_dir, orca_torsions, method_description, required_cpus, (charge, spin))
 ```

 %% Cell type:code id: tags:

 ``` python
 #2. compute
 #run quantum mechanics
 #see logs in orca_output directory...
 input_path = "bp86tzvp/input/"
 output_path = "bp86tzvp/output/"
 successfuly_converged = 0
 clusters_count = len(clusters_ids)

 for orca_method in tqdm(os.listdir(input_path)):
    if not orca_method.endswith(".inp"):
        continue
    orca_method_name = orca_method.replace(".inp", "")
    orca_method_number = orca_method_name[len("outCluster"):]
    output_dir_path = output_path + orca_method_name  + "/"
    if not os.path.exists(output_dir_path):
        !mkdir -p -m 757 {output_dir_path}
    shutil.copy(input_path + orca_method, output_dir_path)
    orca_run(orca_method, f"orca_output/bp86tzvp/orca_output{orca_method_number}.out", workdir=output_dir_path, parallel=True)
 parallel_wait()
 ```

 %% Cell type:code id: tags:

 ``` python
 #3. check output && discard non converging clusters
 for cluster_id in clusters_ids:
    with open(f"orca_output/bp86tzvp/orca_output{cluster_id}.out" as iorca_output:
        iorca_output_text = iorca_output.read()
        if "****ORCA TERMINATED NORMALLY****" not in iorca_output_text:
            !{orca_job_check} orca_output/bp86tzvp/orca_output{cluster_id}.out
            clusters_ids.remove(int(cluster_id))
        else:
            successfuly_converged += 1

 print(f"** {successfuly_converged}/{clusters_count} successfuly converged - unconverged are not considered in next steps **")
 ```

 %% Cell type:code id: tags:

 ``` python
 #extract final energies from output of orca
 with open("orca_energies.txt", "a") as ofile:
    for orca_output in os.listdir("orca_output/bp86tzvp"):
        with open("orca_output/bp86tzvp/" + orca_output) as iorca_output:
            for line in reversed(list(iorca_output)):
                energy_list = re.findall(r'(FINAL SINGLE POINT ENERGY)( +)(-?\d+\.\d+)', line)
                if len(energy_list) > 0:
                    ofile.write(energy_list[0][2])
                    ofile.write('\n')
                    break
 ```

 %% Cell type:code id: tags:

 ``` python
 #convert from hartree to kJ/mol and write quantum mechanics energies to file

 energies_in_hartree = []
 with open("orca_energies.txt", "r") as ifile:
    for line in ifile.readlines():
        energies_in_hartree.append(float(line))

 #constant to convert from hartree to kJ/mol
 CONVERSION_CONST = 2625.499638
 minimum = min(energies_in_hartree)

 energies_in_kJ = []
 for H in energies_in_hartree:
    energies_in_kJ.append((H-minimum)*CONVERSION_CONST)

 with open("orca_energies.txt", "w") as ofile:
    for energy in energies_in_kJ:
        ofile.write("{}\n".format(energy))
 ```

 %% Cell type:code id: tags:

 ``` python
 #copy atoms from conformation before QM and combine it with orca optimised conformations

 output_dir = "pdb_opt/"
 input_dir = "bp86tzvp/output/outCluster"

 !mkdir -p {output_dir}

 atoms = []
 with open("clustering/outClustersPDB/outCluster0.pdb", "r") as ifile:
    for line in ifile.readlines():
        if "ATOM" in line:
            atoms.append(line[:26])

 for i in clusters_ids:
    hetatms = []
    !babel -ixyz {input_dir}{i}/outCluster{i}.xyz -opdb {output_dir}temp_cluster_{i}.pdb
    with open("{}temp_cluster_{}.pdb".format(output_dir, i), "r") as ifile:
        for line in ifile.readlines():
            if "HETATM" in line:
                hetatms.append(line[27:66])
    with open("pdb_opt/cluster{}.pdb".format(i), "w") as output_cluster:
        for j in range(len(atoms)):
            output_cluster.write(atoms[j])
            output_cluster.write(hetatms[j] + "\n")

 !rm {output_dir}/temp_*
 ```

 %% Cell type:code id: tags:

 ``` python
 input_dir = "pdb_opt/"
 output_file = "clusters.pdb"

 #concatinate optimised conformations to trajectory
 with open(output_file, "w") as ofile:
    for i in clusters_ids:
        ofile.write("MODEL {}\n".format(str(i)))
        with open("{}cluster{}.pdb".format(input_dir, i), "r") as ifile:
            ofile.write(ifile.read())
            ofile.write("ENDMDL\n")

 with open("plumed.dat", "w") as ifile:
    cvs = []
    num_atoms = sum(1 for line in open("{}cluster{}.pdb".format(input_dir, clusters_ids[0])))
    ifile.write("WHOLEMOLECULES ENTITY0=1-{}\n".format(str(num_atoms)))
    for i in range(0, len(torsions)):
        cvs.append("cv{}".format(i))
        ifile.write("TORSION ATOMS=")
        ifile.write(",".join(str(x) for x in torsions[i]))
        ifile.write(" LABEL={}\n".format(cvs[i]))
    ifile.write("PRINT ARG=")
    ifile.write(",".join(cvs))
    ifile.write(" STRIDE=1 FILE=DIHEDRALS")

 #compute dihedrals
 gmx_run(f"driver --plumed plumed.dat --mf_pdb {output_file}")

 lines = []
 with open("DIHEDRALS", "r") as ifile:
    for line in ifile.readlines():
        if "#" not in line:
            lines.append(line)
 with open("DIHEDRALS", "w") as ofile:
    for line in lines:
        ofile.write(line)
 ```

 %% Cell type:code id: tags:

 ``` python
 #perform minimisations and compute GAFF energy

 cvs = [[] for _ in range(len(clusters_ids))]
 with open("DIHEDRALS","r") as ifile:
    dihedrals = ifile.readlines()
    for i in range(len(dihedrals)):
        t_angles = dihedrals[i].split()
        for j in range(len(torsions)):
            cvs[i].append(float(t_angles[j+1])*(180/math.pi))

 def generate_restraint(cluster_i):
    with open("MOL_GMX.top", "r") as ifile, open("gaff/cluster_{}/restrained.top".format(str(cluster_i)), "w") as ofile:
        for line in ifile.readlines():
            if line == "; Ligand position restraints\n":
                ofile.write("\n")
                ofile.write("[ dihedral_restraints ]\n")
                for j in range(len(torsions)):
                    ofile.write(" ".join(torsions[j]))
                    ofile.write("2 %3.1f 0 500\n" %cvs[i][j])
            ofile.write(line)
 !mkdir -p -m 757 gaff

 #select groups for energy evaluation
 groups = "10"

 for i in tqdm(clusters_ids):
    !mkdir -p -m 757 gaff/cluster_{i}
    shutil.copy("pdb_opt/cluster{}.pdb".format(i), "gaff/cluster_{}".format(i))
    shutil.copy("MOL_GMX.top", "gaff/cluster_{}".format(i))
    shutil.copy("em.mdp", "gaff/cluster_{}".format(i))
    shutil.copy("md.mdp", "gaff/cluster_{}".format(i))
    generate_restraint(i)
    gmx_run(f"editconf -f cluster{i}.pdb -box 3 3 3 -bt cubic -c -o box.gro", workdir=f"/gaff/cluster_{i}")
    gmx_run("grompp -f em -c box.gro -p restrained.top -o em1", workdir=f"/gaff/cluster_{i}")
    gmx_run("mdrun -ntomp 2 -s em1 -c after_em1 -g em1 -e em1 -o em1", workdir=f"/gaff/cluster_{i}")
    gmx_run("grompp -f md -c box.gro -p MOL_GMX.top -o rerun", workdir=f"/gaff/cluster_{i}")
    gmx_run("mdrun -ntomp 2 -s rerun -rerun em1 -c after_rerun -g rerun -e rerun -o rerun", workdir=f"/gaff/cluster_{i}")
    gmx_run("energy -f rerun.edr -o rerun.xvg", workdir=f"/gaff/cluster_{i}", groups=groups)
 ```

 %% Cell type:code id: tags:

 ``` python
 #extract energies and from each energy value subtract minimal energy

 energies_lst = []

 for i in clusters_ids:
    with open("gaff/cluster_{}/rerun.xvg".format(i, "r")) as ifile:
        last_line = ifile.readlines()[-1]
        energies = last_line.split(" ")
        energies_lst.append(energies[len(energies) - 1].rstrip())

 min_energy = min(energies_lst)
 with open("gaff_energies.txt", "w") as ofile:
    for energy in energies_lst:
        ofile.write("{} \n".format((float(energy) - float(min_energy))))
 ```

 %% Cell type:markdown id: tags:

 ## 6. Define correction of force field

 %% Cell type:code id: tags:

 ``` python
 #subtract from orca energies computed in step 5, GAFF energies computed in step 6

 with open("reference", "w") as ofile:
    with open("orca_energies.txt", "r") as orca_energies, open("gaff_energies.txt", "r") as gaff_energies:
        for orca_energy in orca_energies.readlines():
            for gaff_energy in gaff_energies.readlines():
                ofile.write("{} \n".format((float(orca_energy) - float(gaff_energy))))
 ```

 %% Cell type:code id: tags:

 ``` python
 #write final corrections into output file along with representative conformations

 input_dir = "pdb_opt/"

 with open("reference.pdb", "w") as ofile, open("reference", "r") as ifile:
    final_energies = ifile.readlines()
    energy_index = 0
    for i in clusters_ids:
        ofile.write("REMARK X={}".format(final_energies[energy_index]))
        with open("{}cluster{}.pdb".format(input_dir, i), "r") as ifile1:
            ofile.writelines(ifile1.readlines())
        energy_index += 1
 ```

 %% Cell type:code id: tags:

 ``` python
 x1 = []
 y1 = []

 with open("DIHEDRALS") as ifile:
    for line in ifile.readlines():
        split_values = line.split()
        x1.append(split_values[1])
        y1.append(split_values[2])

 plot_landmarks(x, y, x1, y1).show()
 ```