diff --git a/Dockerfile b/Dockerfile index 83bc9a267e6fea15edba1f58293a2d2f568a8fde..38abafd576f0cd931260c22f507d2f44320c8090 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,39 +1,51 @@ -FROM ubuntu:18.04 +FROM continuumio/miniconda3 as build USER root ENV DEBIAN_FRONTEND=noninteractive ENV TZ=Europe/Prague -COPY --from=spectraes/pmcv-pipeline-python:2021-04-19 /opt/intelpython3 /opt/intelpython3 -COPY --from=lachlanevenson/k8s-kubectl:v1.20.2 /usr/local/bin/kubectl /usr/local/bin/kubectl +WORKDIR /tmp +COPY environment.yml . +RUN conda env create -f environment.yml -RUN bash -c "source /opt/intelpython3/bin/activate && jupyter-nbextension enable nglview --py --sys-prefix" -RUN bash -c "apt-get update && apt-get install -y libxrender1 libgfortran3 git sudo jq apt-transport-https gnupg2 curl xz-utils" -#install parmtSNE -RUN bash -c "cd /opt && git clone https://github.com/spiwokv/parmtSNEcv.git" -RUN bash -c "source /opt/intelpython3/bin/activate && pip install 'ruamel.yaml<=0.15.94' && cd /opt/parmtSNEcv && pip install . 
&& pip install --ignore-installed six tensorflow" +FROM ubuntu:20.04 -#install other tools -ARG distribution=ubuntu18.04 -RUN bash -c "curl -fsSL https://download.docker.com/linux/ubuntu/gpg | apt-key add -" -RUN bash -c "echo 'deb [arch=amd64] https://download.docker.com/linux/ubuntu bionic stable' >>/etc/apt/sources.list" -RUN bash -c "curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | apt-key add -" -RUN bash -c "curl -s -L -o /etc/apt/sources.list.d/nvidia-docker.list https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list" -RUN bash -c "apt update && apt install -y docker-ce-cli nvidia-container-toolkit" +USER root + +ENV DEBIAN_FRONTEND=noninteractive +ENV TZ=Europe/Prague + +RUN apt-get update && apt-get install -y \ + python3-distutils \ + python3-rdkit \ + librdkit1 \ + rdkit-data \ + curl \ + sudo + +RUN curl -fsSLo /usr/share/keyrings/kubernetes-archive-keyring.gpg https://packages.cloud.google.com/apt/doc/apt-key.gpg +RUN bash -c 'echo "deb [signed-by=/usr/share/keyrings/kubernetes-archive-keyring.gpg] https://apt.kubernetes.io/ kubernetes-xenial main" >/etc/apt/sources.list.d/kubernetes.list' +RUN apt update +RUN apt install -y kubectl + +COPY --from=build /opt/conda/envs/pyenv /opt/conda/envs/pyenv +#COPY --from=build /opt/conda /opt/conda #copy all necessary files to run PMCV force field correction pipeline COPY modules /home/base/modules/ COPY tleapin.txt /work/ +COPY init.sh /opt/ +RUN bash -c "/opt/init.sh" -RUN bash -c "useradd --uid 1001 --create-home --shell /bin/bash magic_user" -RUN bash -c "chmod a+rwx /home/base/modules/gmx_orca/lock.pkl" -RUN bash -c "chmod -R a+rX /opt /home" +ENV PATH="$PATH:/opt/conda/envs/pyenv/bin" +ENV PYTHONPATH=/home/base +ENV HOME=/work WORKDIR /work EXPOSE 8888 -#run Jupyter Notebook when container is executed -CMD bash -c "sleep 2 && curl -LO https://gitlab.ics.muni.cz/467814/magicforcefield-pipeline/-/raw/kubernetes/pipelineJupyter.ipynb && source /opt/intelpython3/bin/activate && 
jupyter notebook --ip 0.0.0.0 --allow-root --port 8888" +CMD curl -LO https://gitlab.ics.muni.cz/467814/magicforcefield-pipeline/-/raw/master/pipelineJupyter.ipynb && \ + jupyter notebook --ip 0.0.0.0 --allow-root --port 8888 diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000000000000000000000000000000000000..eb912b548d9c7ae550b4cd227a57d4455937230b --- /dev/null +++ b/environment.yml @@ -0,0 +1,18 @@ +name: pyenv +channels: + - conda-forge +dependencies: + - openbabel + - jupyter + - tqdm + - openbabel + - jupyter + - tqdm + - molvs + - nglview + - matplotlib + - py3dmol + - ruamel.yaml + - ambertools=21 + - acpype + - plotly diff --git a/init.sh b/init.sh new file mode 100644 index 0000000000000000000000000000000000000000..da480a4680684375952c7a5239941f651f7ae97f --- /dev/null +++ b/init.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +# add production user and permissions +useradd --uid 1001 --create-home --shell /bin/bash magic_user +chmod a+rwx /home/base/modules/gmx_orca/lock.pkl +chmod -R a+rX /opt /home + +# create all directories used in computations +cd /work +mkdir -p em +mkdir -p md +mkdir -p mtd +mkdir -p pdb_opt +mkdir -p visualization/traj +mkdir -p -m 757 gaff +mkdir -p -m 757 clustering/{outClustersPDB,outClustersXYZ,outClusters} +mkdir -p -m 757 am1/{input,output} +mkdir -p -m 757 bp86svp/{input,output} +mkdir -p -m 757 orca_output/{am1,bp86svp,bp86tzvp} +mkdir -p -m 757 bp86tzvp/{input,output} diff --git a/modules/__init__.py b/modules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/modules/gmx_orca/gmx_orca_run.py b/modules/gmx_orca/gmx_orca_run.py deleted file mode 100755 index fa6f5c697f18e869fba7267e34c575766b0f0447..0000000000000000000000000000000000000000 --- a/modules/gmx_orca/gmx_orca_run.py +++ /dev/null @@ -1,221 +0,0 @@ -#!/usr/bin/env python -import ruamel_yaml -from ruamel_yaml.scalarstring import DoubleQuotedScalarString -import 
subprocess -import time -import os -import sys -import pickle - -# Set image to be used for gromacs calculations -GMX_IMAGE = "ljocha/gromacs:2021-1" -# Set image to be used for orca calculations -ORCA_IMAGE = "spectraes/pipeline_orca:latest" -# Set default filepaths -KUBERNETES_WAIT_PATH = PICKLE_PATH = os.path.dirname(os.path.realpath(__file__)) - - -def gmx_run(gmx_command, **kwargs): - """ - Convert gmx command into yaml file which is then run by kubernetes - - :param str gmx_command: gromacs command - :kwargs int mpi_run: request number of cpus for mpi_run - :kwargs str groups: specify interactive groups for gromacs - :kwargs str make_ndx: specify interactive make_ndx input for gromacs make_ndx - :kwargs str image: specify used image - :kwargs str workdir: specify directory where should the calculation take place - :kwargs str arch: specify architecture - :kwargs bool double: double precision for mdrun - :kwargs bool rdtscp: enable rndscp - :kwargs bool parallel: run jobs as parallel - """ - - mpi_run = kwargs.get('mpi_run', None) - groups = kwargs.get('groups', None) - make_ndx = kwargs.get('make_ndx', None) - image = kwargs.get('image', None) - workdir = kwargs.get('workdir', None) - double = kwargs.get('double', False) - rdtscp = kwargs.get('rdtscp', False) - arch = kwargs.get('arch', '') - parallel = kwargs.get('parallel', False) - - gmx_method = gmx_command.split()[0] - application = "gmx" - if gmx_method == "driver": - application = "plumed" - elif gmx_method == "mdrun": - application = "" - - gmx = "{} {}".format(application, gmx_command) - - if mpi_run: - gmx = "mpirun -np {} {}".format(mpi_run, gmx) - - if groups: - gmx = ") | {}".format(gmx) - for i in range(len(groups)): - # Write in reverse order because you insert from right to left - gmx = "echo {} {}".format(groups[::-1][i], gmx) - if i == (len(groups) - 1): - gmx = "({}".format(gmx) - break - gmx = "; sleep 1; {}".format(gmx) - - if make_ndx: - gmx = "| {}".format(gmx) - gmx = "(echo \'{}\'; 
sleep 1; echo q) {}".format(make_ndx, gmx) - - kubernetes_config, label = write_template(gmx_method, image, gmx, workdir, parallel, double=double, rdtscp=rdtscp, - arch=arch) - print(run_job(kubernetes_config, label, parallel)) - - -def orca_run(orca_method, log, **kwargs): - """ - Convert orca command into yaml file which is then run by kubernetes - - :param str orca_command: orca method used for computation - :param str log: log file to store output of computation - :kwargs str image: specify used image - :kwargs str workdir: specify directory where should the calculation take place - """ - image = kwargs.get('image', None) - workdir = kwargs.get('workdir', None) - parallel = kwargs.get('parallel', False) - - log = f"/tmp/{log}" - application = "orca" - orca = "/opt/orca/{} {} > {}".format(application, orca_method, log) - - kubernetes_config, label = write_template(application, image, orca, workdir,parallel, orca_method_file=f"{workdir}/{orca_method}") - print(run_job(kubernetes_config, label, parallel)) - - -def parallel_wait(): - """ - Wait for all currently running parallel tasks to finish - - :return: Nothing - """ - with open(f"{PICKLE_PATH}/lock.pkl","rb") as fp: - lock_object = pickle.load(fp) - label = lock_object['Parallel_label'] - count = lock_object['Count'] - if len(label) == 0 or count <= 0: - print(f"Nothing to wait for with label => {label}", file=sys.stderr) - return 1 - - # reset pickle - with open(f"{PICKLE_PATH}/lock.pkl","wb") as fp: - values = {"Parallel_label": "", "Count": 0} - pickle.dump(values, fp) - - print(run_wait(f"-l {label} -c {count}")) - - -def write_template(method, image, command, workdir, parallel, **kwargs): - with open(f"{os.path.dirname(os.path.realpath(__file__))}/kubernetes-template.yaml") as ifile: - doc = ruamel_yaml.round_trip_load(ifile, preserve_quotes=True) - - double = kwargs.get('double', 'OFF') - rdtscp = kwargs.get('rdtscp', 'OFF') - orca_method_file = kwargs.get('orca_method_file', '') - arch = 
kwargs.get('arch', '') - - # Set default values - default_image = GMX_IMAGE - default_name = "gromacs" - if method == "orca": - default_image = ORCA_IMAGE - default_name = "orca" - - # Always replace "" with "-" because "" is not kubernetes accepted char in the name - method = method.replace("_", "-") - - # Set names - timestamp = str(time.time()).replace(".", "") - identificator = "{}-{}-rdtscp-{}".format(default_name, method, timestamp) - doc['metadata']['name'] = identificator - doc['spec']['template']['spec']['containers'][0]['name'] = "{}-{}-deployment-{}".format(default_name, method, timestamp) - doc['spec']['template']['metadata']['labels']['app'] = identificator - - # Set gromacs args - doc['spec']['template']['spec']['containers'][0]['args'] = ["/bin/bash", "-c", DoubleQuotedScalarString(command)] - - # If not orca, set options for gmx container - if method != "orca": - double_env = {'name': "GMX_DOUBLE", 'value': DoubleQuotedScalarString("ON" if double else "OFF")} - rdtscp_env = {'name': "GMX_RDTSCP", 'value': DoubleQuotedScalarString("ON" if rdtscp else "OFF")} - arch_env = {'name': "GMX_ARCH", 'value': DoubleQuotedScalarString(arch)} - doc['spec']['template']['spec']['containers'][0]['env'] = [double_env, rdtscp_env, arch_env] - - # If parallel is enabled set label so kubectl logs can print logs according to label - if parallel: - with open(f"{PICKLE_PATH}/lock.pkl","rb") as fp: - lock_object = pickle.load(fp) - if len(lock_object['Parallel_label']) == 0: - label = {"Parallel_label": identificator, "Count": 0} - with open(f"{PICKLE_PATH}/lock.pkl","wb") as fp: - pickle.dump(label, fp) - else: - doc['spec']['template']['metadata']['labels']['app'] = lock_object['Parallel_label'] - - - # Set image - doc['spec']['template']['spec']['containers'][0]['image'] = default_image if not image else image - - # Set working directory - doc['spec']['template']['spec']['containers'][0]['workingDir'] = "/tmp/" - if workdir: - 
doc['spec']['template']['spec']['containers'][0]['workingDir'] += workdir - - # set PVC - pvc_name = os.environ['PVC_NAME'] - if len(pvc_name) == 0: - raise Exception("Error setting pvc_name, probably problem in setting env variable of actual container") - doc['spec']['template']['spec']['volumes'][0]['persistentVolumeClaim']['claimName'] = pvc_name - - # Set orca required cpus - if method == "orca": - no_of_procs = get_no_of_procs(orca_method_file) - if no_of_procs != -1: - doc['spec']['template']['spec']['containers'][0]['resources']['requests']['cpu'] = no_of_procs - - # Write to file - ofile_name = "{}-{}-rdtscp.yaml".format(default_name, method) - with open(ofile_name, "w") as ofile: - ruamel_yaml.round_trip_dump(doc, ofile, explicit_start=True) - - return ofile_name, identificator - - -def run_job(kubernetes_config, label, parallel): - os.system(f"kubectl apply -f {kubernetes_config}") - - if not parallel: - return run_wait(f"-l {label} -c 1") - - # increment pickle count - with open(f"{PICKLE_PATH}/lock.pkl","rb") as fp: - lock_object = pickle.load(fp) - with open(f"{PICKLE_PATH}/lock.pkl","wb") as fp: - lock_object['Count'] += 1 - pickle.dump(lock_object, fp) - - -def get_no_of_procs(orca_method_file): - with open(orca_method_file) as ifile: - for line in ifile.readlines(): - if "nprocs" in line: - return int(line.split()[1]) - return -1 - - -def run_wait(command): - cmd = f"{KUBERNETES_WAIT_PATH}/kubernetes-wait.sh {command}" - process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) - - return process.communicate()[0].decode('utf-8', 'ignore') - diff --git a/modules/k8s/README.md b/modules/k8s/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bb900422937b473779d0518514dced276d7a3a17 --- /dev/null +++ b/modules/k8s/README.md @@ -0,0 +1,86 @@ +# Gromacs and Orca Kubernetes wrapper +Every minor software package needed is contained in the main pipeline image. 
[Gromacs](https://www.gromacs.org/) and [Orca](https://sites.google.com/site/orcainputlibrary/home) are considered too large to be contained in the main container. Therefore these packages are divided into another two independent images. + +This implies there is a need of connecting the main pipeline container with Gromacs and Orca containers. A wrapper has been written exactly for this purpose. The wrapper is written in *Python* and converts the Gromacs/Orca command along with its arguments to a .yaml (Kubernetes job) configuration file. Based on this file the Kubernetes job is spawned where all computations happen. + +# **Gromacs** +## gmx_run() +***Mandatory argument:*** + +``` +string gmx_command - gromacs command to be evaluated +``` + +***Optional arguments:*** + +``` +int mpi_run - request number of cpus for mpi_run software +string groups - specify interactive groups for gromacs (1) +string make_ndx - specify interactive make_ndx input for gromacs make_ndx (2) +string image - specify used gromacs image +string workdir - specify directory where should the calculation take place +string arch - specify CPU architecture +bool double - double precision for mdrun (3) +bool rdtscp - enable rdtscp +bool parallel - run jobs in parallel (independently). Disabled by default +``` + +1. https://manual.gromacs.org/documentation/2019-rc1/reference-manual/analysis/using-groups.html +2. https://manual.gromacs.org/archive/5.0.4/programs/gmx-make_ndx.html +3. 
https://manual.gromacs.org/current/onlinehelp/gmx-mdrun.html + +--- + +***Example usages:*** + +``` +gmx_run('mdrun -deffnm em1 -ntomp 2') +gmx_run('mdrun -deffnm mtd1 -replex 500 -plumed plumed.dat', mpi_run=6) +gmx_run('mdrun -deffnm em1', image='user/gromacs:123') +gmx_run('mdrun -deffnm em1 -ntomp 2', arch='AVX256') +gmx_run('mdrun -deffnm em1 -ntomp 2', workdir='/home', double=True, arch='AVX256') +``` + + +# ***Orca*** +## orca_run() +***Mandatory arguments:*** +``` +string orca_command - orca method (file) used for computation (1) +string log - log file to store the output of computation +``` + +***Optional arguments:*** +``` +string image - specify used Orca image +string workdir - specify directory where should the calculation take place +bool parallel - run jobs in parallel (independently). Disabled by default +``` + +1. https://sites.google.com/site/orcainputlibrary/generalinput + +--- + +***Example usages:*** + +``` +orca_run('method1.inp', 'output1.out', workdir='/home', parallel=True) +``` + + +# ***Gromacs/Orca*** +## parallel_wait() +This method has to be placed after a `gmx_run()` or `orca_run()` with *parallel* enabled. It is a wait mechanism to detect if all jobs running in parallel finished. Afterwards logs of these jobs are printed. 
+ +``` +orca_run('method1.inp', 'output1.out', workdir='/home', parallel=True) +orca_run('method2.inp', 'output2.out', workdir='/home', parallel=True) + +parallel_wait() + +--- + +for i in range(6): + orca_run(f'method{i}.inp', f'output{i}.out', workdir='/home', parallel=True) +parallel_wait() +``` diff --git a/modules/k8s/__init__.py b/modules/k8s/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/modules/k8s/config.py b/modules/k8s/config.py new file mode 100644 index 0000000000000000000000000000000000000000..8ce3113271bd5db07762fcafd8242cd24d2e4635 --- /dev/null +++ b/modules/k8s/config.py @@ -0,0 +1,11 @@ +import os + +class Config: + + # set images to be used in calculations + GMX_IMAGE = "ljocha/gromacs:2021-1" + ORCA_IMAGE = "spectraes/pipeline_orca:latest" + PARMTSNECV_IMAGE = "spectraes/parmtsnecv:28-08-2021" + + # set filepaths relative to this config file + KUBERNETES_WAIT_PATH = PICKLE_PATH = os.path.dirname(os.path.realpath(__file__)) \ No newline at end of file diff --git a/modules/k8s/k8s_run.py b/modules/k8s/k8s_run.py new file mode 100755 index 0000000000000000000000000000000000000000..6aecd523e3d08aa59633feb72989b819f0171d1f --- /dev/null +++ b/modules/k8s/k8s_run.py @@ -0,0 +1,135 @@ +#!/usr/bin/env python +import ruamel.yaml +from ruamel.yaml.scalarstring import DoubleQuotedScalarString +import subprocess +import time +import os +import sys +import pickle + +import modules.k8s.k8s_utils as k8s_utils +from modules.k8s.config import Config + + +def gmx_run(gmx_command, **kwargs): + """ + Converts gmx command into yaml file which is then run by kubernetes + + :param str gmx_command: gromacs command + :kwargs int mpi_run: request number of cpus for mpi_run + :kwargs str groups: specify interactive groups for gromacs + :kwargs str make_ndx: specify interactive make_ndx input for gromacs make_ndx + :kwargs str image: specify used image + :kwargs str workdir: specify
directory where should the calculation take place + :kwargs str arch: specify architecture + :kwargs bool double: double precision for mdrun + :kwargs bool rdtscp: enable rdtscp + :kwargs bool parallel: run jobs as parallel + """ + + params = { + "mpi_run": kwargs.get('mpi_run', None), + "groups": kwargs.get('groups', None), + "make_ndx": kwargs.get('make_ndx', None), + "image": kwargs.get('image', None), + "workdir": kwargs.get('workdir', None), + "double": kwargs.get('double', None), + "rdtscp": kwargs.get('rdtscp', None), + "arch": kwargs.get('arch', ""), + "parallel": kwargs.get('parallel', None) + } + + gmx_method = gmx_command.split()[0] + application = "gmx" + if gmx_method == "driver": + application = "plumed" + elif gmx_method == "mdrun": + application = "" + + gmx = "{} {}".format(application, gmx_command) + + if params["mpi_run"]: + gmx = "mpirun -np {} {}".format(params["mpi_run"], gmx) + + if params["groups"]: + gmx = ") | {}".format(gmx) + for i in range(len(params["groups"])): + gmx = "echo {} {}".format(params["groups"][::-1][i], gmx) + if i == (len(params["groups"]) - 1): + gmx = "({}".format(gmx) + break + gmx = "; sleep 1; {}".format(gmx) + + if params["make_ndx"]: + gmx = "| {}".format(gmx) + gmx = "(echo \'{}\'; sleep 1; echo q) {}".format(params["make_ndx"], gmx) + + kubernetes_config, label = k8s_utils.write_template(gmx_method, gmx, params) + print(k8s_utils.run_job(kubernetes_config, label, params["parallel"])) + + +def orca_run(orca_method, log, **kwargs): + """ + Convert orca command into yaml file which is then run by kubernetes + + :param str orca_method: orca method used for computation + :param str log: log file to store output of computation + :kwargs str image: specify used image + :kwargs str workdir: specify directory where should the calculation take place + """ + + params = { + "image": kwargs.get('image', None), + "workdir": kwargs.get('workdir', None), + "parallel": kwargs.get('parallel', None) + } + + log = f"/tmp/{log}" + 
application = "orca" + orca = "/opt/orca/{} {} > {}".format(application, orca_method, log) + method_path = "{}/{}".format(params['workdir'], orca_method) + + kubernetes_config, label = k8s_utils.write_template(application, orca, params, orca_method_file=method_path) + print(k8s_utils.run_job(kubernetes_config, label, params["parallel"])) + + +def parmtsnecv_run(command, **kwargs): + ''' + Convert parmtSNEcv command into yaml file which is then run by kubernetes + + :kwargs str image: specify used image + :kwargs str workdir: specify directory where should the calculation take place + ''' + + params = { + "image": kwargs.get('image', None), + "workdir": kwargs.get('workdir', None), + "parallel": None + } + + application = 'parmtsnecv' + kubernetes_config, label = k8s_utils.write_template(application, command, params) + print(k8s_utils.run_job(kubernetes_config, label, None)) + + +def parallel_wait(): + """ + Wait for all currently running parallel tasks to finish + + :return: Nothing + """ + with open(f"{Config.PICKLE_PATH}/lock.pkl","rb") as fp: + lock_object = pickle.load(fp) + label = lock_object['Parallel_label'] + count = lock_object['Count'] + if len(label) == 0 or count <= 0: + print(f"Nothing to wait for with label => {label}", file=sys.stderr) + return 1 + + # reset pickle + with open(f"{Config.PICKLE_PATH}/lock.pkl","wb") as fp: + values = {"Parallel_label": "", "Count": 0} + pickle.dump(values, fp) + + print(k8s_utils.run_wait(f"-l {label} -c {count}")) + diff --git a/modules/k8s/k8s_utils.py b/modules/k8s/k8s_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..024b5e7519207fd2942d132a0f3e2d6cda336083 --- /dev/null +++ b/modules/k8s/k8s_utils.py @@ -0,0 +1,114 @@ +#!/usr/bin/env python +import ruamel.yaml +from ruamel.yaml.scalarstring import DoubleQuotedScalarString +import subprocess +import time +import os +import sys +import pickle + +from modules.k8s.config import Config + + +def write_template(method, command, params, 
**kwargs): + with open(f"{os.path.dirname(os.path.realpath(__file__))}/kubernetes-template.yaml") as ifile: + doc = ruamel.yaml.round_trip_load(ifile, preserve_quotes=True) + + orca_method_file = kwargs.get('orca_method_file', '') + timestamp = str(time.time()).replace(".", "") + # Always replace "" with "-" because "" is not kubernetes accepted char in the name + method = method.replace("_", "-") + # Set default values + default_image = '' + default_name = '' + + + if method == "orca": + default_image = Config.ORCA_IMAGE + default_name = "orca" + + # Set orca required cpus + no_of_procs = get_no_of_procs(orca_method_file) + if no_of_procs != -1: + doc['spec']['template']['spec']['containers'][0]['resources']['requests']['cpu'] = no_of_procs + elif method == 'parmtsnecv': + default_image = Config.PARMTSNECV_IMAGE + default_name = 'parmtsnecv' + else: + default_image = Config.GMX_IMAGE + default_name = 'gromacs' + + double_env = {'name': "GMX_DOUBLE", 'value': DoubleQuotedScalarString("ON" if params["double"] else "OFF")} + rdtscp_env = {'name': "GMX_RDTSCP", 'value': DoubleQuotedScalarString("ON" if params["rdtscp"] else "OFF")} + arch_env = {'name': "GMX_ARCH", 'value': DoubleQuotedScalarString(params["arch"])} + doc['spec']['template']['spec']['containers'][0]['env'] = [double_env, rdtscp_env, arch_env] + + + + identificator = "{}-{}-rdtscp-{}".format(default_name, method, timestamp) + # Set names + doc['metadata']['name'] = identificator + doc['spec']['template']['spec']['containers'][0]['name'] = "{}-{}-deployment-{}".format(default_name, method, timestamp) + doc['spec']['template']['metadata']['labels']['app'] = identificator + # Set gromacs/orca command + doc['spec']['template']['spec']['containers'][0]['args'] = ["/bin/bash", "-c", DoubleQuotedScalarString(command)] + # Set image + doc['spec']['template']['spec']['containers'][0]['image'] = default_image if not params["image"] else params["image"] + # Set working directory + 
doc['spec']['template']['spec']['containers'][0]['workingDir'] = "/tmp/" + if params["workdir"]: + doc['spec']['template']['spec']['containers'][0]['workingDir'] += params["workdir"] + + # set PVC + pvc_name = os.environ['PVC_NAME'] + if len(pvc_name) == 0: + raise Exception("Error setting pvc_name, probably problem in setting env variable of actual container") + doc['spec']['template']['spec']['volumes'][0]['persistentVolumeClaim']['claimName'] = pvc_name + + # If parallel is enabled set label so kubectl logs can print logs according to label + if params["parallel"]: + with open(f"{Config.PICKLE_PATH}/lock.pkl","rb") as fp: + lock_object = pickle.load(fp) + if len(lock_object['Parallel_label']) == 0: + label = {"Parallel_label": identificator, "Count": 0} + with open(f"{Config.PICKLE_PATH}/lock.pkl","wb") as fp: + pickle.dump(label, fp) + else: + doc['spec']['template']['metadata']['labels']['app'] = lock_object['Parallel_label'] + + # Write to file + ofile_name = "{}-{}-rdtscp.yaml".format(default_name, method) + with open(ofile_name, "w") as ofile: + ruamel.yaml.round_trip_dump(doc, ofile, explicit_start=True) + + return ofile_name, identificator + + +def run_job(kubernetes_config, label, parallel): + os.system(f"kubectl apply -f {kubernetes_config}") + + if not parallel: + return run_wait(f"-l {label} -c 1") + + # increment pickle count + with open(f"{Config.PICKLE_PATH}/lock.pkl","rb") as fp: + lock_object = pickle.load(fp) + with open(f"{Config.PICKLE_PATH}/lock.pkl","wb") as fp: + lock_object['Count'] += 1 + pickle.dump(lock_object, fp) + + +def get_no_of_procs(orca_method_file): + with open(orca_method_file) as ifile: + for line in ifile.readlines(): + if "nprocs" in line: + return int(line.split()[1]) + return -1 + + +def run_wait(command): + cmd = f"{Config.KUBERNETES_WAIT_PATH}/kubernetes-wait.sh {command}" + process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) + + return process.communicate()[0].decode('utf-8', 'ignore') + diff --git 
a/modules/gmx_orca/kubernetes-template.yaml b/modules/k8s/kubernetes-template.yaml similarity index 100% rename from modules/gmx_orca/kubernetes-template.yaml rename to modules/k8s/kubernetes-template.yaml diff --git a/modules/gmx_orca/kubernetes-wait.sh b/modules/k8s/kubernetes-wait.sh similarity index 100% rename from modules/gmx_orca/kubernetes-wait.sh rename to modules/k8s/kubernetes-wait.sh diff --git a/modules/gmx_orca/lock.pkl b/modules/k8s/lock.pkl similarity index 100% rename from modules/gmx_orca/lock.pkl rename to modules/k8s/lock.pkl diff --git a/modules/replace.py b/modules/replace.py deleted file mode 100644 index 0789f015ef1445dbf04e88b1f578bbffb83a5b01..0000000000000000000000000000000000000000 --- a/modules/replace.py +++ /dev/null @@ -1,16 +0,0 @@ -def replace_text_for_embedding(input_file=None, stride=None, file_name=None, weight=None): - #change file - with open(input_file, "r") as ifile: - data = ifile.read() - if stride != None: - data = data.replace("STRIDE=" + str(stride[0]), "STRIDE=" + str(stride[1])) - if file_name != None: - data = data.replace("FILE=" + file_name[0], "FILE=" + file_name[1]) - if weight != None: - data = data.replace(str(weight[0]), str(weight[1])) - if data == "": - raise SystemExit("Operation not supported") - - #write final output - with open(input_file, "w") as ofile: - ofile.writelines(data) diff --git a/parmtSNEcv/Dockerfile b/parmtSNEcv/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..546bcac612c853445d7f9f28eb17b3f959e937d2 --- /dev/null +++ b/parmtSNEcv/Dockerfile @@ -0,0 +1,24 @@ +FROM python:2 AS build + +WORKDIR /opt +RUN git clone https://github.com/spiwokv/parmtSNEcv.git + +## +COPY setup.py /opt/parmtSNEcv +## + +RUN python -m virtualenv venv + +WORKDIR /opt/parmtSNEcv +RUN bash -c "source /opt/venv/bin/activate && python -m pip install ." 
+ + +#alpine +FROM python:2-slim + +RUN apt-get update && apt-get install libgomp1 + +COPY --from=build /opt/venv /opt/venv + +ENV PATH="/opt/venv/bin:$PATH" + diff --git a/pipelineJupyter.ipynb b/pipelineJupyter.ipynb index 04acc14f3e0a3bacb08b07c5d66a4889715a0dc5..06ec1e74c2e5fb50b3989f8da32319a793f934b0 100644 --- a/pipelineJupyter.ipynb +++ b/pipelineJupyter.ipynb @@ -4,7 +4,16 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## 1. Definition of computational parameters" + "# Property Map Collective Variable Force Field Correction Pipeline\n", + "---\n", + "The pipeline generates force field corrections in .pdb format and is divided into these steps:\n", + "\n", + "1. [Molecule shape processing](#1.-Molecule-shape-processing)\n", + "2. [Preparation of environment and molecule](#2.-Preparation-of-environment-and-molecule)\n", + "3. [Generation of representative configurations](#3.-Generation-of-representative-configurations)\n", + "4. [Accurate energy computation](#4.-Accurate-energy-computation)\n", + "5. [Inaccurate energy computation](#5.-Inaccurate-energy-computation)\n", + "6. 
[Define correction of force field](#6.-Define-correction-of-force-field)" ] }, { @@ -14,58 +23,37 @@ "outputs": [], "source": [ "import os\n", - "import sys\n", - "import shutil\n", "import re\n", + "import sys\n", "import math\n", "import time\n", + "import shutil\n", "import subprocess\n", - "import nglview as nv\n", - "import pytraj as pt\n", "\n", - "#insert path to python scripts\n", - "sys.path.insert(1,\"/home/base/modules\")\n", - "sys.path.insert(1,\"/home/base/modules/gmx_orca\")\n", - "\n", - "from draw_2d import *\n", - "from draw_3d import drawit\n", + "# import chemical software\n", + "from rdkit.Chem.Draw import IPythonConsole\n", + "from rdkit import Chem\n", "from tqdm.notebook import tqdm\n", - "\n", "from molvs import Standardizer\n", + "import nglview as nv\n", + "import pytraj as pt\n", + "import matplotlib.pyplot as plt\n", "\n", - "from rdkit import Chem\n", - "from rdkit.Chem import Draw\n", - "from rdkit.Chem import SDWriter\n", - "from rdkit.Chem import AllChem\n", - "from rdkit.Chem import MolFromSmarts\n", - "from rdkit.Chem import rdmolops\n", - "from rdkit.Chem import rdchem\n", - "\n", - "from convert import convert_to_orca_methods\n", - "from plot_graph import plot_landmarks\n", - "from gmx_orca_run import gmx_run, orca_run, parallel_wait\n", - "from replace import replace_text_for_embedding" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#number of configurations to generate in basic energy minimization\n", - "numc = 50\n", + "# import custom libraries\n", + "from modules.draw_3d import drawit\n", + "from modules.convert import convert_to_orca_methods\n", + "from modules.plot_graph import plot_landmarks\n", + "from modules.k8s.k8s_run import gmx_run, orca_run, parmtsnecv_run, parallel_wait\n", "\n", - "#path to applications\n", - "orca_job_check = \"/home/base/modules/orcajobcheck.py\"\n", - "xyz_to_mol = \"/home/base/modules/xyz2mol/xyz2mol.py\"" + "# path to 
needed scripts\n", + "orca_job_check = '/home/base/modules/orcajobcheck.py'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## 2. Molecule shape processing" + "## 1. Molecule shape processing" ] }, { @@ -74,12 +62,19 @@ "metadata": {}, "outputs": [], "source": [ - "#specify input of desired molecule in SMILES\n", - "smiles_molecule = \"CC1=C(C=C(C=C1)NC(=O)C2=CC=C(C=C2)C[NH+]3CCN(CC3)C)NC4=NC=CC(=N4)C5=CN=CC=C5\"\n", + "# specify input of desired molecule in SMILES\n", + "smiles_molecule = 'CC1=C(C=C(C=C1)NC(=O)C2=CC=C(C=C2)C[NH+]3CCN(CC3)C)NC4=NC=CC(=N4)C5=CN=CC=C5'\n", + "\n", + "# set visualization parameters\n", + "# ... add Indices to molecule image\n", + "IPythonConsole.drawOptions.addAtomIndices = True\n", + "\n", + "# ... set molecule size\n", + "IPythonConsole.molSize = 900,900\n", "\n", - "molecule = Chem.MolFromSmiles(smiles_molecule)\n", "\n", - "render_svg(moltosvg(molecule))" + "molecule = Chem.MolFromSmiles(smiles_molecule)\n", + "molecule" ] }, { @@ -88,16 +83,23 @@ "metadata": {}, "outputs": [], "source": [ - "with open(\"molekula.smi\", \"w\") as smifile:\n", - " smifile.write(smiles_molecule)\n", - " \n", "s = Standardizer()\n", "molecule = s.standardize(molecule)\n", "molecule = Chem.AddHs(molecule)\n", "natoms = molecule.GetNumAtoms()\n", "charge = Chem.rdmolops.GetFormalCharge(molecule)\n", "\n", - "render_svg(moltosvg(molecule))" + "molecule" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1.1 Set the lowest energy configuration\n", + "Perform basic energy minimisation by running [Merck molecular force field (MMFF94)](https://open-babel.readthedocs.io/en/latest/Forcefields/mmff94.html) and choose the conformation with the lowest energy.\n", + "\n", + "Visualize the configuration with lowest energy in 3D afterwards." 
] }, { @@ -106,40 +108,34 @@ "metadata": {}, "outputs": [], "source": [ - "#set lowest energy configuration\n", + "# number of configurations to generate\n", + "numc = 50\n", "\n", - "#set default id of minimal energy conformer to -1\n", - "minid = -1\n", - "#set default minimal energy to highest possible float number\n", - "minene = sys.float_info.max\n", + "Chem.AllChem.EmbedMultipleConfs(molecule, clearConfs=True, numConfs=numc)\n", "\n", - "AllChem.EmbedMultipleConfs(molecule, clearConfs=True, numConfs=numc)\n", - "#run 'MMFF94 force field' energy evaluation\n", - "done = AllChem.MMFFOptimizeMoleculeConfs(molecule)\n", + "# run MMFF94\n", + "optim = Chem.AllChem.MMFFOptimizeMoleculeConfs(molecule)\n", "\n", - "for i in range(len(done)):\n", - " if done[i][1]<minene:\n", - " minene = done[i][1]\n", + "minid = -1\n", + "minene = sys.float_info.max\n", + "for i in range(len(optim)):\n", + " if optim[i][1] < minene:\n", + " minene = optim[i][1]\n", " minid = i\n", - "print(f'Minimal energy: {minene}')\n", "\n", - "writer = SDWriter(\"molekula.mol\")\n", - "writer.write(molecule, confId=minid)\n", + "# write to file molekula.mol for further processing\n", + "writer = Chem.SDWriter('molekula.mol')\n", + "writer.write(molecule, confId = minid)\n", "\n", - "drawit(molecule)" + "drawit(molecule, confId = minid)" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "#set pattern to detect torsion angles\n", - "\n", - "RotatableBond = Chem.MolFromSmarts('[!$([NH]!@C(=O))&!D1&!$(*#*)&!$([C;H3])&!$([O;H1])&!$([N;H3])]-&!@[!$([NH]!@C(=O))&!D1&!$(*#*)&!$([C;H3])&!$([O;H1])&!$([N;H3])]')\n", - "rotatables = molecule.GetSubstructMatches(RotatableBond)\n", - "print(rotatables)" + "### 1.2 Detect torsion angles\n", + "Detect torsions based on pattern in *smarts*. Output can be checked on 2D visualization in step [2. Molecule shape processing](#2.-Molecule-shape-processing)." 
] }, { @@ -148,26 +144,36 @@ "metadata": {}, "outputs": [], "source": [ - "#get numbers of atoms that form torsion angles\n", + "bond_smarts = ''.join(('[!$([NH]!@C(=O))&!D1&!$(*#*)&!$([C;H3])&!$([O;H1])&!$([N;H3])]-&!@',\n", + " '[!$([NH]!@C(=O))&!D1&!$(*#*)&!$([C;H3])&!$([O;H1])&!$([N;H3])]'))\n", + "\n", + "rotatable_bond = Chem.MolFromSmarts(bond_smarts)\n", + "rotatables = molecule.GetSubstructMatches(rotatable_bond)\n", + "print(f'Rotatables: {rotatables}')\n", + "\n", "\n", "torsions = []\n", "for rotatable in rotatables:\n", " pairs1 = []\n", " pairs2 = []\n", " for bond in molecule.GetBonds():\n", - " if rotatable[0]==bond.GetBeginAtomIdx() and rotatable[1]!=bond.GetEndAtomIdx():\n", - " pairs1.append([bond.GetBeginAtomIdx(),bond.GetEndAtomIdx()])\n", - " if rotatable[1]==bond.GetBeginAtomIdx() and rotatable[0]!=bond.GetEndAtomIdx():\n", - " pairs2.append([bond.GetBeginAtomIdx(),bond.GetEndAtomIdx()])\n", - " torsions.append([pairs1[0][1],pairs1[0][0],pairs2[0][0],pairs2[0][1]])\n", - "print(torsions)" + " if rotatable[0] == bond.GetBeginAtomIdx() and rotatable[1] != bond.GetEndAtomIdx():\n", + " pairs1.append([bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()])\n", + " if rotatable[1] == bond.GetBeginAtomIdx() and rotatable[0] != bond.GetEndAtomIdx():\n", + " pairs2.append([bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()])\n", + " torsions.append([pairs1[0][1], pairs1[0][0], pairs2[0][0], pairs2[0][1]])\n", + "print(f'Torsions: {torsions}')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## 3. Preparation of environment and molecule" + "## 2. Preparation of environment and molecule\n", + "### 2.1 Perform energetic minimisation\n", + "Generate config file and other files needed for energetic minimisation. During generation an ordering of atoms will get wrong. This is fixed afterwards from output files to fit the minimised molecule.\n", + "\n", + "Finally perform an energetic minimisation using [Gromacs](https://www.gromacs.org/). 
This pipeline uses wrapper so the Gromacs can be run independently to this environment. Please read the wrapper [documentation](https://github.com/CERIT-SC/pmcvff-correction/tree/jupyter-refactor/modules/gmx_orca) before you interact with any Gromacs command." ] }, { @@ -176,13 +182,35 @@ "metadata": {}, "outputs": [], "source": [ - "# prepare input files for energetic minimization\n", + "with open(\"em/em.mdp\", \"w\") as emfile:\n", + " emfile.write(\"integrator = steep\\n\")\n", + " emfile.write(\"nsteps = 100000\\n\")\n", + " emfile.write(\"emtol = 0\\n\")\n", + " emfile.write(\"emstep = 0.1\\n\")\n", + " emfile.write(\"nstcomm = 1\\n\")\n", + " emfile.write(\"nstxout = 100\\n\")\n", + " emfile.write(\"nstvout = 100\\n\")\n", + " emfile.write(\"nstfout = 0\\n\")\n", + " emfile.write(\"nstlog = 100\\n\")\n", + " emfile.write(\"nstenergy = 100\\n\")\n", + " emfile.write(\"nstlist = 1\\n\")\n", + " emfile.write(\"ns_type = grid\\n\")\n", + " emfile.write(\"coulombtype = cut-off\\n\")\n", + " emfile.write(\"rlist = 1.4\\n\")\n", + " emfile.write(\"rcoulomb = 1.4\\n\")\n", + " emfile.write(\"rvdw = 1.4\\n\")\n", + " emfile.write(\"energygrps = System\\n\")\n", + " emfile.write(\"epsilon-r = 80\\n\")\n", + " emfile.write(\"\\n\")\n", "\n", - "netcharge = rdmolops.GetFormalCharge(molecule)\n", - "!antechamber -i molekula.mol -fi mdl -o molekula.prepi -fo prepi -c bcc -nc {netcharge} && \\\n", + "\n", + "!antechamber -i molekula.mol -fi mdl -o molekula.prepi -fo prepi -c bcc -nc {charge} && \\\n", "parmchk2 -i molekula.prepi -f prepi -o molekula.frcmod && \\\n", "tleap -f tleapin.txt && \\\n", - "acpype -p molekula.prmtop -x molekula.inpcrd" + "acpype -p molekula.prmtop -x molekula.inpcrd\n", + "\n", + "shutil.copy(\"MOL_GMX.gro\", \"em/\")\n", + "shutil.copy(\"MOL_GMX.top\", \"em/\")" ] }, { @@ -191,20 +219,20 @@ "metadata": {}, "outputs": [], "source": [ - "#fix ordering of atoms\n", + "# fix ordering of atoms\n", "\n", "order_before = []\n", - "order_after = []\n", 
- "\n", "with open('sqm.pdb','r') as pdbfile:\n", " for atom in pdbfile.readlines():\n", " order_before.append(atom.split()[2])\n", " \n", + "order_after = []\n", "with open('MOL_GMX.gro','r') as grofile:\n", " for atom in grofile.readlines():\n", " if atom.startswith(' 1 MOL'):\n", " order_after.append(atom.split()[2])\n", "\n", + " \n", "torsions_new = []\n", "torsion_new = []\n", "for torsion in torsions:\n", @@ -212,8 +240,9 @@ " torsion_new.append(order_after.index(order_before[i])+1)\n", " torsions_new.append(torsion_new)\n", " torsion_new = []\n", + " \n", "torsions = torsions_new\n", - "print(torsions)" + "print(f'New torsions: {torsions}')" ] }, { @@ -222,40 +251,19 @@ "metadata": {}, "outputs": [], "source": [ - "!mkdir -p em\n", - "with open(\"em/em.mdp\", \"w\") as emfile:\n", - " emfile.write(\"integrator = steep\\n\")\n", - " emfile.write(\"nsteps = 100000\\n\")\n", - " emfile.write(\"emtol = 0\\n\")\n", - " emfile.write(\"emstep = 0.1\\n\")\n", - " emfile.write(\"nstcomm = 1\\n\")\n", - " emfile.write(\"nstxout = 100\\n\")\n", - " emfile.write(\"nstvout = 100\\n\")\n", - " emfile.write(\"nstfout = 0\\n\")\n", - " emfile.write(\"nstlog = 100\\n\")\n", - " emfile.write(\"nstenergy = 100\\n\")\n", - " emfile.write(\"nstlist = 1\\n\")\n", - " emfile.write(\"ns_type = grid\\n\")\n", - " emfile.write(\"coulombtype = cut-off\\n\")\n", - " emfile.write(\"rlist = 1.4\\n\")\n", - " emfile.write(\"rcoulomb = 1.4\\n\")\n", - " emfile.write(\"rvdw = 1.4\\n\")\n", - " emfile.write(\"energygrps = System\\n\")\n", - " emfile.write(\"epsilon-r = 80\\n\")\n", - " emfile.write(\"\\n\")\n", - "shutil.copy(\"MOL_GMX.gro\", \"em/\")\n", - "shutil.copy(\"MOL_GMX.top\", \"em/\")" + "gmx_run('editconf -f MOL_GMX -o box -c -box 3 3 3', workdir='em')\n", + "gmx_run('grompp -f em.mdp -c box -p MOL_GMX -o em1', workdir='em')\n", + "gmx_run('mdrun -deffnm em1 -ntomp 2', workdir='em')" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": 
{}, - "outputs": [], "source": [ - "gmx_run(\"editconf -f MOL_GMX -o box -c -box 3 3 3\", workdir=\"em\")\n", - "gmx_run(\"grompp -f em.mdp -c box -p MOL_GMX -o em1\", workdir=\"em\")\n", - "gmx_run(\"mdrun -deffnm em1 -ntomp 2\", workdir=\"em\")" + "### 2.2 Perform molecular dynamics simulation\n", + "Create config file and perform molecular dynamics simulation. Simulation trajectory can be visualized.\n", + "\n", + "Afterwards a [periodic boundary conditions](https://www.gromacs.org/Documentation_of_outdated_versions/Terminology/Periodic_Boundary_Conditions) must be applied so the molecule \"does not jump out of the box\". " ] }, { @@ -264,37 +272,34 @@ "metadata": {}, "outputs": [], "source": [ - "#prepare files for getting the molecule to dynamic state\n", - "\n", - "!mkdir -p md\n", - "with open(\"md/md.mdp\", \"w\") as mdfile:\n", - " mdfile.write(\"integrator = sd\\n\")\n", - " mdfile.write(\"nsteps = 100000\\n\")\n", - " mdfile.write(\"dt = 0.001\\n\")\n", - " mdfile.write(\"nstxout = 1000\\n\")\n", - " mdfile.write(\"nstvout = 1000\\n\")\n", - " mdfile.write(\"nstenergy = 1000\\n\")\n", - " mdfile.write(\"nstlog = 1000\\n\")\n", - " mdfile.write(\"continuation = no\\n\")\n", - " mdfile.write(\"constraints = none\\n\")\n", - " mdfile.write(\"cutoff-scheme = Verlet\\n\")\n", - " mdfile.write(\"ns_type = grid\\n\")\n", - " mdfile.write(\"nstlist = 1\\n\")\n", - " mdfile.write(\"rlist = 1.4\\n\")\n", - " mdfile.write(\"rcoulomb = 1.4\\n\")\n", - " mdfile.write(\"rvdw = 1.4\\n\")\n", - " mdfile.write(\"coulombtype = cut-off\\n\")\n", - " mdfile.write(\"tcoupl = V-rescale\\n\")\n", - " mdfile.write(\"tc-grps = system\\n\")\n", - " mdfile.write(\"tau_t = 0.1\\n\")\n", - " mdfile.write(\"ref_t = 300\\n\")\n", - " mdfile.write(\"pcoupl = no\\n\")\n", - " mdfile.write(\"pbc = xyz\\n\")\n", - " mdfile.write(\"gen_vel = yes\\n\")\n", - " mdfile.write(\"epsilon-r = 80\\n\")\n", - " mdfile.write(\"\\n\")\n", - "shutil.copy(\"em/em1.gro\", \"md/\")\n", - 
"shutil.copy(\"MOL_GMX.top\", \"md/\")" + "with open('md/md.mdp', 'w') as mdfile:\n", + " mdfile.write('integrator = sd\\n')\n", + " mdfile.write('nsteps = 100000\\n')\n", + " mdfile.write('dt = 0.001\\n')\n", + " mdfile.write('nstxout = 1000\\n')\n", + " mdfile.write('nstvout = 1000\\n')\n", + " mdfile.write('nstenergy = 1000\\n')\n", + " mdfile.write('nstlog = 1000\\n')\n", + " mdfile.write('continuation = no\\n')\n", + " mdfile.write('constraints = none\\n')\n", + " mdfile.write('cutoff-scheme = Verlet\\n')\n", + " mdfile.write('ns_type = grid\\n')\n", + " mdfile.write('nstlist = 1\\n')\n", + " mdfile.write('rlist = 1.4\\n')\n", + " mdfile.write('rcoulomb = 1.4\\n')\n", + " mdfile.write('rvdw = 1.4\\n')\n", + " mdfile.write('coulombtype = cut-off\\n')\n", + " mdfile.write('tcoupl = V-rescale\\n')\n", + " mdfile.write('tc-grps = system\\n')\n", + " mdfile.write('tau_t = 0.1\\n')\n", + " mdfile.write('ref_t = 300\\n')\n", + " mdfile.write('pcoupl = no\\n')\n", + " mdfile.write('pbc = xyz\\n')\n", + " mdfile.write('gen_vel = yes\\n')\n", + " mdfile.write('epsilon-r = 80\\n')\n", + " mdfile.write('\\n')\n", + "shutil.copy('em/em1.gro', 'md/')\n", + "shutil.copy('MOL_GMX.top', 'md/')" ] }, { @@ -303,8 +308,8 @@ "metadata": {}, "outputs": [], "source": [ - "gmx_run(\"grompp -f md.mdp -c em1 -p MOL_GMX -o md1\", workdir=\"md\")\n", - "gmx_run(\"mdrun -deffnm md1 -ntomp 2\", workdir=\"md\")" + "gmx_run('grompp -f md.mdp -c em1 -p MOL_GMX -o md1', workdir='md')\n", + "gmx_run('mdrun -deffnm md1 -ntomp 2', workdir='md')" ] }, { @@ -315,15 +320,25 @@ }, "outputs": [], "source": [ - "#show trajectory of conformations\n", - "#select group for trjconv evaluation\n", - "#Group 0 ( System)\n", - "#Group 1 ( Other)\n", - "#Group 2 ( MOL)\n", - "group = \"0\"\n", + "# convert trajectory to .pdb format so it can be visualized\n", "\n", - "gmx_run(\"trjconv -s md1.tpr -f md1.trr -o outTraj.pdb\", workdir=\"md\", groups=group)\n", + "# select group for trjconv evaluation output\n", 
+ "# Group 0 ( System)\n", + "# Group 1 ( Other)\n", + "# Group 2 ( MOL)\n", + "group = '0'\n", "\n", + "\n", + "gmx_run('trjconv -pbc nojump -s md1.tpr -f md1.trr -o outTraj.pdb', workdir='md', groups=group)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# visualize the molecular dynamics trajectory\n", "traj = pt.load('md/outTraj.pdb')\n", "view = nv.show_pytraj(traj)\n", "view" @@ -335,25 +350,32 @@ "metadata": {}, "outputs": [], "source": [ - "#fix periodic boundaries errors when the molecule jumps out of the box\n", - "#select group for trjconv evaluation\n", - "#Group 0 ( System)\n", - "#Group 1 ( Other)\n", - "#Group 2 ( MOL)\n", - "group = \"1\"\n", + "# fix periodic boundaries errors \n", + "\n", + "# select group for trjconv evaluation output\n", + "# Group 0 ( System)\n", + "# Group 1 ( Other)\n", + "# Group 2 ( MOL)\n", + "group = '1'\n", + "\n", "\n", "for i in tqdm(range(len(torsions))):\n", " fr = str(float(100-len(torsions)+i)-0.01)\n", " to = str(float(100-len(torsions)+i)+0.01)\n", - " gmx_run(f\"trjconv -pbc nojump -s md1 -f md1 -o frame{i}.gro -b {fr} -e {to}<<EOF\\n0\\nEOF\", workdir=\"md\", groups=group)" + " gmx_run(f'trjconv -pbc nojump -s md1 -f md1 -o frame{i}.gro -b {fr} -e {to}<<EOF\\n0\\nEOF',\n", + " workdir='md',\n", + " groups=group)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## 4. Generation of representative configurations\n", - "### 4.1 Trajectory generation" + "## 3. Generation of representative configurations\n", + "### 3.1 Trajectory generation\n", + "Create config file and *plumed.dat* file. Based on these files run the simulation (with metadynamics) to generate a trajectory.\n", + "\n", + "[Plumed](https://www.plumed.org/) is used to run metadynamics." 
] }, { @@ -362,33 +384,32 @@ "metadata": {}, "outputs": [], "source": [ - "!mkdir -p mtd\n", - "with open(\"mtd/mtd.mdp\", \"w\") as mtdfile:\n", - " mtdfile.write(\"integrator = sd\\n\")\n", - " mtdfile.write(\"nsteps = 10000000\\n\") #10 ns each replica\n", - " mtdfile.write(\"dt = 0.001\\n\")\n", - " mtdfile.write(\"nstxout = 10000\\n\") # output coordinates every 10ps to *.trr\n", - " mtdfile.write(\"nstvout = 10000\\n\") # output velocities every 10ps to *.trr\n", - " mtdfile.write(\"nstenergy = 1000\\n\")\n", - " mtdfile.write(\"nstlog = 1000\\n\")\n", - " mtdfile.write(\"continuation = no\\n\")\n", - " mtdfile.write(\"constraints = none\\n\")\n", - " mtdfile.write(\"cutoff-scheme = Verlet\\n\")\n", - " mtdfile.write(\"ns_type = grid\\n\")\n", - " mtdfile.write(\"nstlist = 1\\n\")\n", - " mtdfile.write(\"rlist = 1.4\\n\")\n", - " mtdfile.write(\"rcoulomb = 1.4\\n\")\n", - " mtdfile.write(\"rvdw = 1.4\\n\")\n", - " mtdfile.write(\"coulombtype = cut-off\\n\")\n", - " mtdfile.write(\"tcoupl = V-rescale\\n\")\n", - " mtdfile.write(\"tc-grps = system\\n\")\n", - " mtdfile.write(\"tau_t = 0.1\\n\")\n", - " mtdfile.write(\"ref_t = 300\\n\")\n", - " mtdfile.write(\"pcoupl = no\\n\")\n", - " mtdfile.write(\"pbc = xyz\\n\")\n", - " mtdfile.write(\"gen_vel = yes\\n\")\n", - " mtdfile.write(\"epsilon-r = 80\\n\")\n", - " mtdfile.write(\"\\n\")" + "with open('mtd/mtd.mdp', 'w') as mtdfile:\n", + " mtdfile.write('integrator = sd\\n')\n", + " mtdfile.write('nsteps = 10000000\\n')\n", + " mtdfile.write('dt = 0.001\\n')\n", + " mtdfile.write('nstxout = 10000\\n')\n", + " mtdfile.write('nstvout = 10000\\n')\n", + " mtdfile.write('nstenergy = 1000\\n')\n", + " mtdfile.write('nstlog = 1000\\n')\n", + " mtdfile.write('continuation = no\\n')\n", + " mtdfile.write('constraints = none\\n')\n", + " mtdfile.write('cutoff-scheme = Verlet\\n')\n", + " mtdfile.write('ns_type = grid\\n')\n", + " mtdfile.write('nstlist = 1\\n')\n", + " mtdfile.write('rlist = 1.4\\n')\n", + " 
mtdfile.write('rcoulomb = 1.4\\n')\n", + " mtdfile.write('rvdw = 1.4\\n')\n", + " mtdfile.write('coulombtype = cut-off\\n')\n", + " mtdfile.write('tcoupl = V-rescale\\n')\n", + " mtdfile.write('tc-grps = system\\n')\n", + " mtdfile.write('tau_t = 0.1\\n')\n", + " mtdfile.write('ref_t = 300\\n')\n", + " mtdfile.write('pcoupl = no\\n')\n", + " mtdfile.write('pbc = xyz\\n')\n", + " mtdfile.write('gen_vel = yes\\n')\n", + " mtdfile.write('epsilon-r = 80\\n')\n", + " mtdfile.write('\\n')" ] }, { @@ -398,22 +419,25 @@ "outputs": [], "source": [ "for i in range(len(torsions)):\n", - " if not os.path.exists(\"mtd/w{}\".format(i)):\n", - " !mkdir -p mtd/w{i}\n", - " with open(\"mtd/w{}/plumed.dat\".format(i), \"w\") as plumeddat:\n", - " plumeddat.write(\"RANDOM_EXCHANGES\\n\")\n", - " plumeddat.write(\"WHOLEMOLECULES ENTITY0=1-{}\\n\".format(natoms))\n", + " if not os.path.exists(f'mtd/w{i}'):\n", + " os.mkdir(f'mtd/w{i}')\n", + " \n", + " with open(f'mtd/w{i}/plumed.dat', \"w\") as plumeddat:\n", + " plumeddat.write('RANDOM_EXCHANGES\\n')\n", + " plumeddat.write(f'WHOLEMOLECULES ENTITY0=1-{natoms}\\n')\n", " for j in range(len(torsions)):\n", - " plumeddat.write(\"TORSION ATOMS={},{},{},{} LABEL=cv{}\\n\".format(torsions[j][0],torsions[j][1],torsions[j][2],torsions[j][3],j+1))\n", - " plumeddat.write(\"METAD ARG=cv{} HEIGHT=0.5 SIGMA=0.3 PACE=1000 GRID_MIN=-pi GRID_MAX=pi BIASFACTOR=15 LABEL=be\\n\".format(i+1))\n", + " line = ','.join((f'TORSION ATOMS={torsions[j][0]},{torsions[j][1]}',\n", + " f'{torsions[j][2]},{torsions[j][3]} LABEL=cv{j+1}\\n'))\n", + " plumeddat.write(line)\n", + " line = ' '.join((f'METAD ARG=cv{i+1} HEIGHT=0.5 SIGMA=0.3 PACE=1000 GRID_MIN=-pi',\n", + " 'GRID_MAX=pi BIASFACTOR=15 LABEL=be\\n'))\n", + " plumeddat.write(line)\n", " cvs = \"\"\n", " for j in range(len(torsions)):\n", - " cvs=cvs+\"cv{},\".format(j+1)\n", + " cvs = cvs + f'cv{j+1},'\n", " cvs = cvs[:-1]\n", - " plumeddat.write(\"PRINT ARG={} STRIDE=1000 
FILE=COLVAR\\n\".format(cvs))\n", - " plumeddat.write(\"PRINT ARG=be.bias STRIDE=1000 FILE=BIAS\\n\")\n", - "\n", - "shutil.copy(\"MOL_GMX.top\", \"mtd/\")" + " plumeddat.write(f'PRINT ARG={cvs} STRIDE=1000 FILE=COLVAR\\n')\n", + " plumeddat.write('PRINT ARG=be.bias STRIDE=1000 FILE=BIAS\\n')" ] }, { @@ -422,9 +446,14 @@ "metadata": {}, "outputs": [], "source": [ + "shutil.copy('MOL_GMX.top', 'mtd/')\n", + "\n", + "# perform preprocessing before generation of the trajectory\n", "for i in tqdm(range(len(torsions))):\n", - " shutil.copy(\"md/frame{}.gro\".format(i), \"mtd/w{}/\".format(i))\n", - " gmx_run(f\"grompp -f mtd.mdp -c w{i}/frame{i} -p MOL_GMX -o w{i}/mtd1\", workdir=\"mtd\", parallel=True)\n", + " shutil.copy(f'md/frame{i}.gro', f'mtd/w{i}/')\n", + " gmx_run(f'grompp -f mtd.mdp -c w{i}/frame{i} -p MOL_GMX -o w{i}/mtd1',\n", + " workdir='mtd',\n", + " parallel=True)\n", "parallel_wait()" ] }, @@ -434,21 +463,25 @@ "metadata": {}, "outputs": [], "source": [ - "directories = \"\"\n", + "directories = ''\n", "for i in range(len(torsions)):\n", - " directories = directories + \"w{} \".format(i)\n", + " directories = directories + f'w{i} '\n", "\n", - "#see mdrunlog in mtd directory for insight to mdrun simulation (e.g 'tail -f mdrunlog')\n", - "gmx_run(f\"mdrun -g mdrunlog -ntomp 1 -deffnm mtd1 -replex 500 -plumed plumed.dat -multidir {directories}\", \n", - " workdir=\"mtd\", \n", - " mpi_run=len(torsions))\n" + "# see mdrunlog in mtd directory for insight into running \n", + "# mdrun simulation (e.g 'tail -f mdrunlog')\n", + "gmx_run(f'mdrun -g mdrunlog -ntomp 1 -deffnm mtd1 -replex 500 -plumed plumed.dat -multidir {directories}', \n", + " workdir='mtd', \n", + " mpi_run=len(torsions))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### 4.2 Configurations clustering" + "### 3.2 Configurations clustering\n", + "Concatinate all the trajectories that simulation produced. 
Then cluster this trajectory to groups for which one representative configuration is chosen (*cutoff* can be modified for more/less clusters).\n", + "\n", + "Result of [Gromacs clustering](https://manual.gromacs.org/documentation/current/onlinehelp/gmx-cluster.html) is .pdb file containing all representative configurations. These must be divided into separate .pdb files for further processing." ] }, { @@ -457,53 +490,70 @@ "metadata": {}, "outputs": [], "source": [ - "#select groups for cluster evaluation\n", - "#Group 0 ( System)\n", - "#Group 1 ( Other)\n", - "#Group 2 ( MOL)\n", - "#Group 3 ( Custom)\n", - "groups = \"30\"\n", - "!mkdir -p -m 757 clustering ; mkdir -p -m 757 clustering/outClustersPDB ; mkdir -p -m 757 clustering/outClustersXYZ ; mkdir -p -m 757 clustering/outClusters\n", - "\n", - "trajectories = \"\"\n", + "trajectories = ''\n", "for i in range(len(torsions)):\n", - " trajectories = trajectories + f\"mtd/w{i}/mtd1.trr \"\n", + " trajectories = trajectories + f'mtd/w{i}/mtd1.trr '\n", + "\n", + "# concatinate trajectories \n", + "gmx_run(f'trjcat -f {trajectories} -cat -o mtd/mtd1.trr')\n", "\n", - "#concatinate trajectories \n", - "gmx_run(f\"trjcat -f {trajectories} -cat -o mtd/mtd1.trr\")\n", - "#make index file with non-hydrogen atoms, use 1 & ! 
a H* and q for input\n", + "# make index file with non-hydrogen atoms\n", "gmx_run(\"make_ndx -f md/md1.tpr -o mtd/index.ndx\", make_ndx=\"1&!aH*\")\n", - "#include index file and select index group 3\n", - "gmx_run(\"cluster -method gromos -f mtd/mtd1.trr -s mtd/w0/mtd1.tpr -n mtd/index.ndx -cutoff 0.15 -cl clustering/outCluster.pdb\", \n", - " groups=groups)\n", - "\n", - "\n", - "#divide all clusters from gmx cluster to single clusters\n", - "with open(\"clustering/outCluster.pdb\") as input_cluster:\n", - " i = 0\n", - " outFile = open(\"clustering/outClustersPDB/outCluster{}.pdb\".format(i), \"w\")\n", - " for line in input_cluster:\n", - " if line != \"ENDMDL\\n\":\n", - " outFile.write(line)\n", - " continue\n", - " outFile.write(\"ENDMDL\\n\")\n", - " i += 1\n", - " outFile.close()\n", - " outFile = open(\"clustering/outClustersPDB/outCluster{}.pdb\".format(i), \"w\")\n", - "!rm clustering/outClustersPDB/outCluster{i}.pdb\n", - "clusters_ids = [x for x in range(0, i)]\n", - "\n", - "\n", - "#convert .pdb clusters to .xyz\n", - "for pdb_cluster in tqdm(os.listdir(\"clustering/outClustersPDB/\")):\n", - " !babel -ipdb clustering/outClustersPDB/{pdb_cluster} -oxyz clustering/outClustersXYZ/{pdb_cluster.replace(\"pdb\", \"xyz\")}" + "\n", + "\n", + "# select groups for cluster evaluation output\n", + "# Group 0 ( System)\n", + "# Group 1 ( Other)\n", + "# Group 2 ( MOL)\n", + "# Group 3 ( Custom)\n", + "groups = '30'\n", + "\n", + "gmx_run('''cluster -method gromos -f mtd/mtd1.trr -s mtd/w0/mtd1.tpr -n mtd/index.ndx -cutoff 0.15 \\\n", + " -cl clustering/outClusters.pdb''', groups=groups)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# divide all clusters from clustering output file \n", + "# to single files and index them from 0.\n", + "# Also fix missing element of each ATOM on \n", + "# line 77 (by pdb format specification)\n", + "cluster_index = 0\n", + "i = 0\n", + "\n", + "with 
open('clustering/outClusters.pdb') as infile:\n", + " clusters = infile.readlines()\n", + " while i < len(clusters):\n", + " with open(f'clustering/outClustersPDB/outCluster{cluster_index}.pdb', 'w') as outfile:\n", + " for line in clusters[i:]:\n", + " split_line = line.split()\n", + " if split_line[0] == 'ATOM':\n", + " line = line[:77] + split_line[2][0] + '\\n'\n", + " outfile.write(line)\n", + " i += 1\n", + " if line == 'ENDMDL\\n':\n", + " break\n", + " cluster_index += 1" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### 4.3 Visualize landmarks" + "### 3.3 Visualize landmarks\n", + "Goal of this part is to compute embeddings which are visualized afterwards. Each step is performed on the trajectory which results from previous step. Base trajectory used in 1st step is the concatinated trajectory from metadynamics simulation.\n", + "\n", + "1. Apply periodic boundary conditions to metadynamics trajectory\n", + "2. Perform fitting on the trajectory\n", + "3. Remove Hydrogen\n", + "4. Train [parmtSNEcv](https://gitlab.ics.muni.cz/spiwokv/parmtSNEcv)\n", + "5. 
Compute embeddings\n", + "\n", + "Finally visualize all generated configurations from metadynamics trajectory in contrast to representative clusters configurations" ] }, { @@ -512,46 +562,36 @@ "metadata": {}, "outputs": [], "source": [ - "#apply periodic boundary conditions to mtd trajectory -> fitting -> no Hydrogen -> train parmtSNEcv -> compute embeddings \n", - "\n", - "!mkdir -p visualization/traj\n", - "\n", - "#Group    0 (        System)\n", - "#Group    1 (         Other)\n", - "#Group    2 (           MOL)\n", - "#Group    3 (        Custom)\n", - "#select group for periodic boundaries check output:\n", - "group = \"0\"\n", - "gmx_run(\"trjconv -f mtd/mtd1.trr -s mtd/w0/mtd1.tpr -pbc mol -o visualization/traj/mtd1_nopbc.xtc\", groups=group)\n", - "#select groups for fitting and output:\n", - "groups = \"00\"\n", - "gmx_run(\"trjconv -f visualization/traj/mtd1_nopbc.xtc -s clustering/outClustersPDB/outCluster0.pdb -fit rot+trans -o visualization/traj/mtd1_fit.xtc\", \n", - " groups=groups)\n", - "#select group for no Hydrogen output:\n", - "group = \"3\"\n", - "gmx_run(\"trjconv -f visualization/traj/mtd1_fit.xtc -n mtd/index.ndx -o visualization/traj/mtd1_fit_noH.xtc\",\n", + "# Group    0 (        System)\n", + "# Group    1 (         Other)\n", + "# Group    2 (           MOL)\n", + "# Group    3 (        Custom)\n", + "# select group for periodic boundaries check output:\n", + "group = '0'\n", + "gmx_run('trjconv -f mtd/mtd1.trr -s mtd/w0/mtd1.tpr -pbc mol -o visualization/traj/mtd1_nopbc.xtc', groups=group)\n", + "\n", + "# select groups for fitting and output:\n", + "groups = '00'\n", + "gmx_run('''trjconv -f visualization/traj/mtd1_nopbc.xtc -s clustering/outClustersPDB/outCluster0.pdb \\\n", + " -fit rot+trans -o visualization/traj/mtd1_fit.xtc''', groups=groups)\n", + "\n", + "# select group for no Hydrogen output:\n", + "group = '3'\n", + "gmx_run('trjconv -f visualization/traj/mtd1_fit.xtc -n mtd/index.ndx -o 
visualization/traj/mtd1_fit_noH.xtc',\n", " groups=group)\n", - "#select groups for size of box and output:\n", - "groups = \"33\"\n", - "gmx_run(\"editconf -f clustering/outClustersPDB/outCluster0.pdb -n mtd/index.ndx -box 3 3 3 -c -o visualization/ref.pdb\",\n", - " groups=groups)\n", "\n", - "\n", - "#train parmtSNEcv\n", - "!parmtSNEcv -i visualization/traj/mtd1_fit_noH.xtc -p visualization/ref.pdb -boxx 3 -boxy 3 -boxz 3 -dim 2 -layers 2 -o visualization/out.txt -plumed visualization/plumed.dat -epochs 2000\n", - "\n", - "\n", - "#modify plumed.dat to compute embedding in every step and change name of file for *** mtd trajectory ***\n", - "replace_text_for_embedding(\"visualization/plumed.dat\", stride=(100, 1), file_name=(\"COLVAR\", \"2d_embedding\"))\n", - "#fix weights of atoms\n", - "replace_text_for_embedding(\"visualization/ref.pdb\", weight=(0.00, 1.00))\n", - "#run as plumed\n", - "gmx_run(\"driver --plumed visualization/plumed.dat --mf_xtc visualization/traj/mtd1_fit.xtc\")\n", - "\n", - "#modify plumed.dat to compute embedding in every step and change name of file for *** representatives ***\n", - "replace_text_for_embedding(\"visualization/plumed.dat\", file_name=(\"2d_embedding\", \"landmarks\"))\n", - "#run as plumed\n", - "gmx_run(\"driver --plumed visualization/plumed.dat --mf_pdb clustering/outCluster.pdb\")" + "# select groups for size of the box and output:\n", + "groups = '33'\n", + "gmx_run('''editconf -f clustering/outClustersPDB/outCluster0.pdb -n mtd/index.ndx -box 3 3 3 -c \\\n", + " -o visualization/ref.pdb''', groups=groups)\n", + "\n", + "# fix weights of atoms\n", + "data = ''\n", + "with open('visualization/ref.pdb', 'r') as infile:\n", + " data = infile.read()\n", + " data = data.replace('0.00', '1.00')\n", + "with open('visualization/ref.pdb', 'w') as outfile:\n", + " outfile.writelines(data)" ] }, { @@ -560,32 +600,84 @@ "metadata": {}, "outputs": [], "source": [ - "#visualize\n", + "# train parmtSNEcv\n", + 
"parmtsnecv_run('''parmtSNEcv -i traj/mtd1_fit_noH.xtc -p ref.pdb -boxx 3 -boxy 3 -boxz 3 \\\n", + " -dim 2 -layers 2 -o out.txt -plumed plumed.dat -epochs 2000''', workdir='visualization')\n", + "\n", + "# modify plumed.dat to compute embedding in every step (100->1) \n", + "# and change the name of file for *** mtd trajectory ***\n", + "with open('visualization/plumed.dat', 'r') as infile:\n", + " data = infile.read()\n", + " data = data.replace('STRIDE=100', 'STRIDE=1')\n", + " data = data.replace('COLVAR', '2d_embedding')\n", + "with open('visualization/plumed.dat', 'w') as outfile:\n", + " outfile.writelines(data)\n", + "\n", + "# run with plumed\n", + "gmx_run('driver --plumed plumed.dat --mf_xtc traj/mtd1_fit.xtc', workdir='visualization')\n", + "\n", + "# modify plumed.dat to compute embedding in every step and change name of file for *** representatives ***\n", + "with open('visualization/plumed.dat', 'r') as infile:\n", + " data = infile.read()\n", + " data = data.replace('2d_embedding', 'landmarks')\n", + "with open('visualization/plumed.dat', 'w') as outfile:\n", + " outfile.writelines(data)\n", + "\n", + "# run with plumed\n", + "shutil.copy('clustering/outClusters.pdb', 'visualization/')\n", + "gmx_run('driver --plumed plumed.dat --mf_pdb outClusters.pdb', workdir='visualization')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# visualize configurations in contrast to representative configurations\n", "\n", "x = []\n", "y = []\n", "x1 = []\n", "y1 = []\n", - "with open(\"2d_embedding\", \"r\") as ifile:\n", - " for line in ifile.readlines()[1:]:\n", + "with open(\"visualization/2d_embedding\", \"r\") as infile:\n", + " for line in infile.readlines()[1:]:\n", " split_values = line.split()\n", " x.append(split_values[1])\n", " y.append(split_values[2])\n", - "with open(\"landmarks\", \"r\") as ifile:\n", - " for line in ifile.readlines()[1:]:\n", + "with 
open(\"visualization/landmarks\", \"r\") as infile:\n", + " for line in infile.readlines()[1:]:\n", " split_values = line.split()\n", " x1.append(split_values[1])\n", " y1.append(split_values[2])\n", " \n", - "plot_landmarks(x, y, x1, y1).show()" + " \n", + "plt.figure(figsize=(100, 50), dpi=100)\n", + "plt.scatter(x, y, c='b', marker='.', label='All Configurations', s=2000)\n", + "plt.scatter(x1, y1, c='r', marker='o', label='Representative configurations', s=3000)\n", + "plt.legend(loc='upper left', prop={'size': 80})\n", + "\n", + "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## 5. Accurate and innacurate energy evaluation\n", - "### 5.1 AM1 method" + "## 4. Accurate energy computation\n", + "**Accurate energy** values are computed in this step using the [Orca](https://sites.google.com/site/orcainputlibrary/) quantum chemistry software. Orca uses [method files](https://sites.google.com/site/orcainputlibrary/generalinput) in exact format to perform the computations. Computation of exact energy values of each representative is composed of 3-step chain. Input for *AM1* are representative configurations defined by clustering in previous step. Each next computation is performed on the output of the previous step.\n", + "\n", + "1. **AM1 optimisation** (*input*: clustering results)\n", + "2. **BP86 SVP** (*input*: AM1 output)\n", + "3. **BP86 TZVP** (*input*: BP86 SVP output, *output*: exact energy values of representative configurations)\n", + "\n", + "Also each step above consists of 3 substeps:\n", + "\n", + "1. Convert the output of previous step to Orca compatible *.inp method*\n", + "2. Perform Orca computation\n", + "3. Analyse output (see logs)\n", + "\n", + "You can view the current state of Orca calculation in output logs (3rd substep) with for example *\\\"tail -f XXX\\\"* command. 
Please read the wrapper [documentation](https://github.com/CERIT-SC/pmcvff-correction/tree/jupyter-refactor/modules/gmx_orca) before interacting with any Orca command." ] }, { @@ -594,15 +686,14 @@ "metadata": {}, "outputs": [], "source": [ - "#fix indexing\n", - "#orca indexes from zero\n", - "\n", + "# fix indexing of atoms because Orca indexes from zero \n", "orca_torsions = [[] for _ in range(len(torsions))]\n", "torsion_index = 0\n", "for torsion in torsions:\n", " for atom_index in torsion:\n", " orca_torsions[torsion_index].append(atom_index - 1)\n", - " torsion_index += 1" + " torsion_index += 1\n", + "torsions = orca_torsions" ] }, { @@ -611,17 +702,102 @@ "metadata": {}, "outputs": [], "source": [ - "#1. convert\n", - "#orca method description - first line in orca method\n", - "method_description = \"!AM1 Opt\"\n", - "input_dir = \"clustering/outClustersXYZ/\"\n", - "output_dir = \"am1/input/\"\n", + "# initialize a list of all clusters that are considered\n", + "# in the quantum chemistry computations. 
For example\n", + "# non-converged simulation on cluster will result in\n", + "# discarding that cluster from further computations\n", + "#\n", + "# ** don't forget to reset clusters variable when\n", + "# rerunning computations as non-converged will be\n", + "# missing **\n", + "clusters = []\n", + "for cluster in os.listdir('clustering/outClustersPDB'):\n", + " if '.pdb' in cluster:\n", + " clusters.append(cluster.replace('.pdb', ''))\n", + " \n", + "\n", + "# convert .pdb file to orca method\n", + "# - method specifies which method to apply by Orca\n", + "# - info is to specify the *charge* and *spin* of molecule\n", + "# - nprocs is to specify the number of CPU's to use (None for default)\n", + "# - xyz specify True to convert .xyz to orca instead\n", + "def pdb2orca(pdb_in, orca_out, method, info, nprocs=None, xyz=False):\n", + " with open(pdb_in, 'r') as infile, open(orca_out, 'w') as outfile:\n", + " outfile.write(f'{method}\\n')\n", + " \n", + " if nprocs:\n", + " outfile.write(f'%pal\\n')\n", + " outfile.write(f'nprocs {str(nprocs)}\\n')\n", + " outfile.write(f'end\\n')\n", + " \n", + " outfile.write('%geom\\n')\n", + " outfile.write('Constraints\\n')\n", + " \n", + " # write torsions\n", + " for torsions_list in torsions:\n", + " outfile.write('{D ')\n", + " outfile.write(' '.join(str(x) for x in torsions_list))\n", + " outfile.write('C}\\n')\n", + " \n", + " outfile.write('end\\n')\n", + " outfile.write('end\\n\\n')\n", + " outfile.write(f'*xyz {info[0]} {info[1]}\\n')\n", + " \n", + " # copy atom information from input and modify\n", + " # to fit the orca method format\n", + " if xyz:\n", + " for line in infile.readlines()[2:]:\n", + " outfile.write(f'{line}')\n", + " else:\n", + " for line in infile.readlines():\n", + " splitline = line.split()\n", + " if splitline[0] == 'ATOM':\n", + " orca_line = f'{splitline[10]} {splitline[5]} {splitline[6]} {splitline[7]}\\n'\n", + " outfile.write(orca_line)\n", + " \n", + " outfile.write('*\\n')" + ] + }, + { + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4.1 AM1 geometry optimisation method\n", + "Perform the semi-empirical [Austin Model 1](https://en.wikipedia.org/wiki/Austin_Model_1) method on representatives. It can be understood as kind of preprocessing (optimisation) which uses approximations instead of exact calculations to speed up the process of the next simulation. Output of AM1 is input for the next [BP86 SVP method.](###-4.3-BP86-SVP-method)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# convert the representatives from clustering to fit \n", + "# the AM1 Orca method\n", + "\n", + "# specify AM1 method\n", + "method = '!AM1 Opt'\n", + "\n", + "# input directory of .pdb representatives\n", + "input_dir = 'clustering/outClustersPDB/'\n", + "\n", + "# output directory where converted .inp Orca \n", + "# methods will be placed\n", + "output_dir = 'am1/input/'\n", + "\n", + "# specify spin (and charge)\n", "spin = 1\n", + "# charge is specified in the first step \n", + "# of 1. Molecule shape processing\n", + "# charge = XXX\n", + "\n", "\n", - "!mkdir -p -m 757 am1 ; mkdir -p -m 757 am1/input ; mkdir -p -m 757 am1/output\n", - "!mkdir -p -m 757 orca_output ; mkdir -p -m 757 orca_output/am1 ; mkdir -p -m 757 orca_output/bp86svp ; mkdir -p -m 757 orca_output/bp86tzvp\n", + "for pdb in tqdm(os.listdir(input_dir)):\n", + " if '.pdb' in pdb:\n", + " infile = input_dir + pdb\n", + " outfile = output_dir + pdb.replace('pdb', 'inp')\n", "\n", - "convert_to_orca_methods(input_dir, output_dir, orca_torsions, method_description, -1, (charge, spin))" + " pdb2orca(infile, outfile, method, (charge,spin))" ] }, { @@ -630,24 +806,24 @@ "metadata": {}, "outputs": [], "source": [ - "#2. 
compute\n", - "#minimisation of conformations before quantum mechanics\n", - "#see logs in orca_output directory...\n", + "# run the AM1 method optimisation\n", + "\n", "input_path = \"am1/input/\"\n", "output_path = \"am1/output/\"\n", - "successfuly_converged = 0\n", - "clusters_count = len(clusters_ids)\n", - "\n", - "for orca_method in tqdm(os.listdir(input_path)):\n", - " if not orca_method.endswith(\".inp\"):\n", - " continue\n", - " orca_method_name = orca_method.replace(\".inp\", \"\")\n", - " orca_method_number = orca_method_name[len(\"outCluster\"):]\n", - " output_dir_path = output_path + orca_method_name + \"/\"\n", - " if not os.path.exists(output_dir_path):\n", - " !mkdir -p -m 757 {output_dir_path}\n", - " shutil.copy(input_path + orca_method, output_dir_path)\n", - " orca_run(orca_method, f\"orca_output/am1/orca_output{orca_method_number}.out\", workdir=output_dir_path, parallel=True)\n", + "\n", + "\n", + "for method_file in os.listdir(input_path):\n", + " if '.inp' in method_file:\n", + " log_file = method_file.replace('inp', 'out')\n", + " cluster_dir = method_file.replace('.inp', '/')\n", + " cluster_dir_path = output_path + cluster_dir\n", + "\n", + " if not os.path.exists(cluster_dir_path):\n", + " os.mkdir(cluster_dir_path) \n", + " shutil.copy(input_path + method_file, cluster_dir_path)\n", + "\n", + " orca_run(method_file, cluster_dir_path + log_file, workdir=cluster_dir_path, parallel=True)\n", + " \n", "parallel_wait()" ] }, @@ -657,22 +833,22 @@ "metadata": {}, "outputs": [], "source": [ - "#3. check output\n", - "#check output of orca methods\n", - "\n", - "for cluster_id in clusters_ids:\n", - " with open(f\"orca_output/am1/orca_output{cluster_id}.out\") as iorca_output:\n", - " iorca_output_text = iorca_output.read()\n", - " if \"****ORCA TERMINATED NORMALLY****\" not in iorca_output_text:\n", - " print(iorca_output_text)\n", - " raise SystemExit(f\"Error in am1 method of cluster no. {cluster_id}. 
You should not continue computation!\")" + "# check the output of AM1 optimisation\n", + "\n", + "for cluster in clusters:\n", + " with open(f'{output_path}{cluster}/{cluster}.out') as infile:\n", + " orca_log = infile.read()\n", + " if '****ORCA TERMINATED NORMALLY****' not in orca_log:\n", + " print(orca_log)\n", + " raise SystemExit(f'Error in AM1 method of {cluster}. You should not continue computation!')\n", + "print(f'AM1 has been successfully finished on all clusters. You can view logs at \"{output_path}\"')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### 5.2 BP86 SVP method" + "### 4.2 BP86 SVP General gradient approximation method" ] }, { @@ -681,20 +857,31 @@ "metadata": {}, "outputs": [], "source": [ - "#1. convert\n", - "#orca method description - first line in orca method\n", - "method_description = \"!BP86 def2-SVP TightSCF Opt\"\n", - "output_dir = \"bp86svp/input/\"\n", + "# specify BP86 method\n", + "method = '!BP86 def2-SVP TightSCF Opt'\n", + "\n", + "# input directory of .pdb representatives\n", + "input_dir = 'am1/output/'\n", + "\n", + "# output directory where converted .inp Orca \n", + "# methods will be placed\n", + "output_dir = 'bp86svp/input/'\n", + "\n", + "# number of CPUs to use\n", + "nprocs = 12\n", + "\n", + "# specify spin (and charge)\n", "spin = 1\n", - "required_cpus = 12\n", + "# charge is specified in the first step \n", + "# of 1. 
Molecule shape processing\n", + "# charge = XXX\n", "\n", - "#orca method description - first line in orca method\n", - "method_description = \"!BP86 def2-SVP TightSCF Opt\"\n", - "!mkdir -p -m 757 bp86svp ; mkdir -p -m 757 bp86svp/input ; mkdir -p -m 757 bp86svp/output\n", "\n", - "for i in clusters_ids:\n", - " input_dir = \"am1/output/outCluster{}/\".format(i)\n", - " convert_to_orca_methods(input_dir, output_dir, orca_torsions, method_description, required_cpus, (charge, spin))" + "for cluster in clusters:\n", + " infile = f'{input_dir}{cluster}/{cluster}.xyz'\n", + " outfile = f'{output_dir}{cluster}.inp'\n", + " \n", + " pdb2orca(infile, outfile, method, (charge,spin), nprocs=nprocs, xyz=True)" ] }, { @@ -703,24 +890,24 @@ "metadata": {}, "outputs": [], "source": [ - "#2. compute\n", - "#minimisation of conformations before quantum mechanics\n", - "#see logs in orca_output directory...\n", - "input_path = \"bp86svp/input/\"\n", - "output_path = \"bp86svp/output/\"\n", - "successfuly_converged = 0\n", - "clusters_count = len(clusters_ids)\n", - "\n", - "for orca_method in tqdm(os.listdir(input_path)):\n", - " if not orca_method.endswith(\".inp\"):\n", - " continue\n", - " orca_method_name = orca_method.replace(\".inp\", \"\")\n", - " orca_method_number = orca_method_name[len(\"outCluster\"):]\n", - " output_dir_path = output_path + orca_method_name + \"/\"\n", - " if not os.path.exists(output_dir_path):\n", - " !mkdir -p -m 757 {output_dir_path}\n", - " shutil.copy(input_path + orca_method, output_dir_path)\n", - " orca_run(orca_method, f\"orca_output/bp86svp/orca_output{orca_method_number}.out\", workdir=output_dir_path, parallel=True)\n", + "# run the BP86 General gradient approximation method\n", + "\n", + "input_path = 'bp86svp/input/'\n", + "output_path = 'bp86svp/output/'\n", + "\n", + "\n", + "for method_file in os.listdir(input_path):\n", + " if '.inp' in method_file:\n", + " log_file = method_file.replace('inp', 'out')\n", + " cluster_dir = 
method_file.replace('.inp', '/')\n", + " cluster_dir_path = output_path + cluster_dir\n", + "\n", + " if not os.path.exists(cluster_dir_path):\n", + " os.mkdir(cluster_dir_path) \n", + " shutil.copy(input_path + method_file, cluster_dir_path)\n", + "\n", + " orca_run(method_file, cluster_dir_path + log_file, workdir=cluster_dir_path, parallel=True)\n", + " \n", "parallel_wait()" ] }, @@ -730,24 +917,26 @@ "metadata": {}, "outputs": [], "source": [ - "#3. check output && discard non converging clusters\n", - "for cluster_id in clusters_ids:\n", - " with open(f\"orca_output/bp86svp/orca_output{cluster_id}.out\") as iorca_output:\n", - " iorca_output_text = iorca_output.read()\n", - " if \"****ORCA TERMINATED NORMALLY****\" not in iorca_output_text:\n", - " !{orca_job_check} orca_output/bp86svp/orca_output{cluster_id}.out\n", - " clusters_ids.remove(int(orca_method_number))\n", - " else:\n", - " successfuly_converged += 1\n", - " \n", - "print(f\"** {successfuly_converged}/{clusters_count} successfuly converged - unconverged are not considered in next steps **\")" + "# check output && discard non-converging clusters\n", + "number_of_clusters = len(clusters)\n", + "\n", + "for cluster in clusters:\n", + " log_file = f'{output_path}{cluster}/{cluster}.out'\n", + " with open(log_file) as infile:\n", + " log = infile.read()\n", + " if '****ORCA TERMINATED NORMALLY****' not in log:\n", + " !{orca_job_check} {log_file}\n", + " clusters.remove(cluster)\n", + "\n", + "print(f'''{len(clusters)}/{number_of_clusters} successfully converged - unconverged are not considered\n", + " in next steps.\\n You can view logs at \"{output_path}\" directory''')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### 5.3 BP86 TZVP method" + "### 4.3 BP86 TZVP General gradient approximation method" ] }, { @@ -756,20 +945,31 @@ "metadata": {}, "outputs": [], "source": [ - "#1. 
convert\n", - "#orca method description - first line in orca method\n", - "method_description = \"!BP86 def2-TZVP TightSCF Opt\"\n", - "output_dir = \"bp86tzvp/input/\"\n", + "# specify BP86 method\n", + "method = '!BP86 def2-TZVP TightSCF Opt'\n", + "\n", + "# input directory of .pdb representatives\n", + "input_dir = 'bp86svp/output/'\n", + "\n", + "# output directory where converted .inp Orca \n", + "# methods will be placed\n", + "output_dir = 'bp86tzvp/input/'\n", + "\n", + "# number of CPUs to use\n", + "nprocs = 12\n", + "\n", + "# specify spin (and charge)\n", "spin = 1\n", - "required_cpus = 12\n", + "# charge is specified in the first step \n", + "# of 1. Molecule shape processing\n", + "# charge = XXX\n", "\n", - "#orca method description - first line in orca method\n", - "method_description = \"!BP86 def2-TZVP TightSCF Opt\"\n", - "!mkdir -p -m 757 bp86tzvp ; mkdir -p -m 757 bp86tzvp/input ; mkdir -p -m 757 bp86tzvp/output\n", "\n", - "for i in clusters_ids:\n", - " input_dir = \"bp86svp/output/outCluster{}/\".format(i)\n", - " convert_to_orca_methods(input_dir, output_dir, orca_torsions, method_description, required_cpus, (charge, spin))" + "for cluster in clusters:\n", + " infile = f'{input_dir}{cluster}/{cluster}.xyz'\n", + " outfile = f'{output_dir}{cluster}.inp'\n", + " \n", + " pdb2orca(infile, outfile, method, (charge,spin), nprocs=nprocs, xyz=True)" ] }, { @@ -778,24 +978,24 @@ "metadata": {}, "outputs": [], "source": [ - "#2. 
compute\n", - "#run quantum mechanics\n", - "#see logs in orca_output directory...\n", - "input_path = \"bp86tzvp/input/\"\n", - "output_path = \"bp86tzvp/output/\"\n", - "successfuly_converged = 0\n", - "clusters_count = len(clusters_ids)\n", - "\n", - "for orca_method in tqdm(os.listdir(input_path)):\n", - " if not orca_method.endswith(\".inp\"):\n", - " continue\n", - " orca_method_name = orca_method.replace(\".inp\", \"\")\n", - " orca_method_number = orca_method_name[len(\"outCluster\"):]\n", - " output_dir_path = output_path + orca_method_name + \"/\"\n", - " if not os.path.exists(output_dir_path):\n", - " !mkdir -p -m 757 {output_dir_path}\n", - " shutil.copy(input_path + orca_method, output_dir_path)\n", - " orca_run(orca_method, f\"orca_output/bp86tzvp/orca_output{orca_method_number}.out\", workdir=output_dir_path, parallel=True)\n", + "# run the BP86 TZVP General gradient approximation method\n", + "\n", + "input_path = 'bp86tzvp/input/'\n", + "output_path = 'bp86tzvp/output/'\n", + "\n", + "\n", + "for method_file in os.listdir(input_path):\n", + " if '.inp' in method_file:\n", + " log_file = method_file.replace('inp', 'out')\n", + " cluster_dir = method_file.replace('.inp', '/')\n", + " cluster_dir_path = output_path + cluster_dir\n", + "\n", + " if not os.path.exists(cluster_dir_path):\n", + " os.mkdir(cluster_dir_path) \n", + " shutil.copy(input_path + method_file, cluster_dir_path)\n", + "\n", + " orca_run(method_file, cluster_dir_path + log_file, workdir=cluster_dir_path, parallel=True)\n", + " \n", "parallel_wait()" ] }, @@ -805,17 +1005,19 @@ "metadata": {}, "outputs": [], "source": [ - "#3. 
check output && discard non converging clusters\n", - "for cluster_id in clusters_ids:\n", - " with open(f\"orca_output/bp86tzvp/orca_output{cluster_id}.out\") as iorca_output:\n", - " iorca_output_text = iorca_output.read()\n", - " if \"****ORCA TERMINATED NORMALLY****\" not in iorca_output_text:\n", - " !{orca_job_check} orca_output/bp86tzvp/orca_output{cluster_id}.out\n", - " clusters_ids.remove(int(cluster_id))\n", - " else:\n", - " successfuly_converged += 1\n", - " \n", - "print(f\"** {successfuly_converged}/{clusters_count} successfuly converged - unconverged are not considered in next steps **\") " + "# check output && discard non-converging clusters\n", + "number_of_clusters = len(clusters)\n", + "\n", + "for cluster in clusters:\n", + " log_file = f'{output_path}{cluster}/{cluster}.out'\n", + " with open(log_file) as infile:\n", + " log = infile.read()\n", + " if '****ORCA TERMINATED NORMALLY****' not in log:\n", + " !{orca_job_check} {log_file}\n", + " clusters.remove(cluster)\n", + "\n", + "print(f'''{len(clusters)}/{number_of_clusters} successfully converged - unconverged are not considered\n", + " in next steps.\\n You can view logs at \"{output_path}\" directory''')" ] }, { @@ -824,16 +1026,17 @@ "metadata": {}, "outputs": [], "source": [ - "#extract final energies from output of orca\n", - "with open(\"orca_energies.txt\", \"a\") as ofile:\n", - " for orca_output in os.listdir(\"orca_output/bp86tzvp\"): \n", - " with open(\"orca_output/bp86tzvp/\" + orca_output) as iorca_output:\n", - " for line in reversed(list(iorca_output)):\n", - " energy_list = re.findall(r'(FINAL SINGLE POINT ENERGY)( +)(-?\\d+\\.\\d+)', line)\n", - " if len(energy_list) > 0:\n", - " ofile.write(energy_list[0][2])\n", - " ofile.write('\\n')\n", - " break" + "# extract final energies from output of Orca TZVP method\n", + "# note: energies are in hartree unit\n", + "orca_energies = {}\n", + "\n", + "for cluster in clusters:\n", + " with 
open(f'bp86tzvp/output/{cluster}/{cluster}.out') as infile:\n", + " for line in reversed(list(infile)):\n", + " energy_list = re.findall(r'(FINAL SINGLE POINT ENERGY)( +)(-?\\d+\\.\\d+)', line)\n", + " if len(energy_list) > 0:\n", + " orca_energies[cluster] = float(energy_list[0][2])\n", + " break" ] }, { @@ -842,24 +1045,24 @@ "metadata": {}, "outputs": [], "source": [ - "#convert from hartree to kJ/mol and write quantum mechanics energies to file\n", + "# convert from hartree to kJ/mol\n", "\n", - "energies_in_hartree = []\n", - "with open(\"orca_energies.txt\", \"r\") as ifile:\n", - " for line in ifile.readlines():\n", - " energies_in_hartree.append(float(line))\n", - "\n", - "#constant to convert from hartree to kJ/mol\n", "CONVERSION_CONST = 2625.499638\n", - "minimum = min(energies_in_hartree)\n", + "min_energy = min(list(orca_energies.values()))\n", "\n", - "energies_in_kJ = []\n", - "for H in energies_in_hartree:\n", - " energies_in_kJ.append((H-minimum)*CONVERSION_CONST)\n", + "for cluster, energy in orca_energies.items():\n", + " orca_energies[cluster] = (energy-min_energy)*CONVERSION_CONST" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. Inaccurate energy computation\n", + "**Inaccurate energy** values are computed via Gromacs. To compute energy values we use structures whose geometry was optimised by Orca. Afterwards dihedrals are computed from optimised structures' trajectory by [Plumed](https://www.plumed.org/). Finally using dihedrals compute inaccurate energy values.\n", "\n", - "with open(\"orca_energies.txt\", \"w\") as ofile:\n", - " for energy in energies_in_kJ:\n", - " ofile.write(\"{}\\n\".format(energy))" + "### 5.1 Convert optimised .xyz files to .pdb format\n", + "Convert optimised structures to corresponding *.pdb* files by combining information about atoms from cluster representative *.pdb* file and xyz coordinates from optimised *.xyz* file. 
" ] }, { @@ -868,32 +1071,42 @@ "metadata": {}, "outputs": [], "source": [ - "#copy atoms from conformation before QM and combine it with orca optimised conformations\n", + "input_dir = 'bp86tzvp/output/'\n", + "output_dir = 'pdb_opt/'\n", "\n", - "output_dir = \"pdb_opt/\"\n", - "input_dir = \"bp86tzvp/output/outCluster\"\n", - "\n", - "!mkdir -p {output_dir}\n", "\n", + "# get information about atoms from cluster representative\n", + "# (frist 26 columns - see .pdb format details for details)\n", "atoms = []\n", - "with open(\"clustering/outClustersPDB/outCluster0.pdb\", \"r\") as ifile:\n", - " for line in ifile.readlines():\n", - " if \"ATOM\" in line:\n", - " atoms.append(line[:26])\n", - "\n", - "for i in clusters_ids:\n", - " hetatms = []\n", - " !babel -ixyz {input_dir}{i}/outCluster{i}.xyz -opdb {output_dir}temp_cluster_{i}.pdb\n", - " with open(\"{}temp_cluster_{}.pdb\".format(output_dir, i), \"r\") as ifile:\n", - " for line in ifile.readlines():\n", - " if \"HETATM\" in line:\n", - " hetatms.append(line[27:66])\n", - " with open(\"pdb_opt/cluster{}.pdb\".format(i), \"w\") as output_cluster:\n", - " for j in range(len(atoms)):\n", - " output_cluster.write(atoms[j])\n", - " output_cluster.write(hetatms[j] + \"\\n\")\n", - "\n", - "!rm {output_dir}/temp_*" + "with open('clustering/outClustersPDB/outCluster0.pdb', 'r') as infile:\n", + " for line in infile.readlines():\n", + " if \"ATOM\" in line:\n", + " atoms.append(line[:26])\n", + "\n", + " \n", + "# combine atoms from .pdb with optimised coordinates from .xyz\n", + "for cluster in clusters:\n", + " with open(f'{input_dir}{cluster}/{cluster}.xyz', 'r') as infile, \\\n", + " open(f'{output_dir}{cluster}_opt.pdb', 'w') as outfile:\n", + " xyz = infile.readlines()[2:]\n", + " for i in range(len(atoms)):\n", + " split_line = xyz[i].split()\n", + " \n", + " # fix spaces near numbers with '-' to fit .pdb format\n", + " # and print only 3 decimal places\n", + " for j in range(1, len(split_line)):\n", + " 
split_line[j] = f'{round(float(split_line[j]), 3):.3f}'\n", + " if split_line[j][0] != '-':\n", + " split_line[j] = f' {split_line[j]}'\n", + " pdb_line = f'{atoms[i]} {split_line[1]} {split_line[2]} {split_line[3]} 1.00 0.00\\n'\n", + " outfile.write(pdb_line)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 5.2 Compute dihedrals" ] }, { @@ -902,33 +1115,35 @@ "metadata": {}, "outputs": [], "source": [ - "input_dir = \"pdb_opt/\"\n", - "output_file = \"clusters.pdb\"\n", - "\n", - "#concatinate optimised conformations to trajectory\n", - "with open(output_file, \"w\") as ofile:\n", - " for i in clusters_ids:\n", - " ofile.write(\"MODEL {}\\n\".format(str(i)))\n", - " with open(\"{}cluster{}.pdb\".format(input_dir, i), \"r\") as ifile:\n", - " ofile.write(ifile.read())\n", - " ofile.write(\"ENDMDL\\n\")\n", - "\n", - "with open(\"plumed.dat\", \"w\") as ifile:\n", + "# concatinate optimised structures to a trajectory\n", + "# so it can be processed by Plumed\n", + "with open('clusters_opt.pdb', 'w') as outfile:\n", + " for cluster in clusters:\n", + " outfile.write(f'MODEL {cluster[len(cluster)-1]}\\n')\n", + " with open(f'pdb_opt/{cluster}_opt.pdb', 'r') as infile:\n", + " outfile.write(infile.read())\n", + " outfile.write('ENDMDL\\n')\n", + "\n", + "\n", + "# create plumed file for Plumed processing\n", + "with open('plumed.dat', 'w') as infile:\n", " cvs = []\n", - " num_atoms = sum(1 for line in open(\"{}cluster{}.pdb\".format(input_dir, clusters_ids[0])))\n", - " ifile.write(\"WHOLEMOLECULES ENTITY0=1-{}\\n\".format(str(num_atoms)))\n", + " infile.write(f'WHOLEMOLECULES ENTITY0=1-{str(natoms)}\\n')\n", " for i in range(0, len(torsions)):\n", - " cvs.append(\"cv{}\".format(i))\n", - " ifile.write(\"TORSION ATOMS=\")\n", - " ifile.write(\",\".join(str(x) for x in torsions[i]))\n", - " ifile.write(\" LABEL={}\\n\".format(cvs[i]))\n", - " ifile.write(\"PRINT ARG=\")\n", - " ifile.write(\",\".join(cvs))\n", - " ifile.write(\" STRIDE=1 
FILE=DIHEDRALS\")\n", + " cvs.append(f'cv{i}')\n", + " infile.write('TORSION ATOMS=')\n", + " infile.write(','.join(str(x) for x in torsions[i]))\n", + " infile.write(f' LABEL={cvs[i]}\\n')\n", + " infile.write('PRINT ARG=')\n", + " infile.write(','.join(cvs))\n", + " infile.write(' STRIDE=1 FILE=DIHEDRALS')\n", + " \n", + "\n", + "# compute dihedrals (produces output file DIHEDRALS)\n", + "gmx_run(f'driver --plumed plumed.dat --mf_pdb clusters_opt.pdb')\n", "\n", - "#compute dihedrals\n", - "gmx_run(f\"driver --plumed plumed.dat --mf_pdb {output_file}\")\n", "\n", + "# remove all # lines, keep only numbers\n", "lines = []\n", "with open(\"DIHEDRALS\", \"r\") as ifile:\n", " for line in ifile.readlines():\n", @@ -939,50 +1154,64 @@ " ofile.write(line)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 5.3 Compute inaccurate energy values" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "#perform minimisations and compute GAFF energy\n", - "\n", - "cvs = [[] for _ in range(len(clusters_ids))]\n", - "with open(\"DIHEDRALS\",\"r\") as ifile:\n", - " dihedrals = ifile.readlines()\n", + "cvs = [[] for _ in range(len(clusters))]\n", + "with open('DIHEDRALS','r') as infile:\n", + " dihedrals = infile.readlines()\n", " for i in range(len(dihedrals)):\n", " t_angles = dihedrals[i].split()\n", " for j in range(len(torsions)):\n", " cvs[i].append(float(t_angles[j+1])*(180/math.pi))\n", "\n", - "def generate_restraint(cluster_i):\n", - " with open(\"MOL_GMX.top\", \"r\") as ifile, open(\"gaff/cluster_{}/restrained.top\".format(str(cluster_i)), \"w\") as ofile:\n", - " for line in ifile.readlines():\n", + " \n", + "def generate_restraint(cluster):\n", + " with open('MOL_GMX.top', 'r') as infile, \\\n", + " open(f'gaff/{cluster}/restrained.top', \"w\") as outfile:\n", + " for line in infile.readlines():\n", " if line == \"; Ligand position restraints\\n\":\n", - " ofile.write(\"\\n\")\n", - " 
ofile.write(\"[ dihedral_restraints ]\\n\")\n", + " outfile.write(\"\\n\")\n", + " outfile.write(\"[ dihedral_restraints ]\\n\")\n", " for j in range(len(torsions)):\n", - " ofile.write(\" \".join(torsions[j]))\n", - " ofile.write(\"2 %3.1f 0 500\\n\" %cvs[i][j])\n", - " ofile.write(line)\n", - "!mkdir -p -m 757 gaff\n", - "\n", - "#select groups for energy evaluation\n", + " outfile.write(\" \".join(torsions[j]))\n", + " outfile.write(\"2 %3.1f 0 500\\n\" %cvs[i][j])\n", + " outfile.write(line)\n", + "\n", + " \n", + "# select groups for energy evaluation\n", + "# Group    0 (        System)\n", + "# Group    1 (         Other)\n", + "# Group    2 (           MOL)\n", + "# Group    3 (        Custom)\n", "groups = \"10\"\n", "\n", - "for i in tqdm(clusters_ids):\n", - " !mkdir -p -m 757 gaff/cluster_{i}\n", - " shutil.copy(\"pdb_opt/cluster{}.pdb\".format(i), \"gaff/cluster_{}\".format(i))\n", - " shutil.copy(\"MOL_GMX.top\", \"gaff/cluster_{}\".format(i))\n", - " shutil.copy(\"em.mdp\", \"gaff/cluster_{}\".format(i))\n", - " shutil.copy(\"md.mdp\", \"gaff/cluster_{}\".format(i))\n", - " generate_restraint(i)\n", - " gmx_run(f\"editconf -f cluster{i}.pdb -box 3 3 3 -bt cubic -c -o box.gro\", workdir=f\"/gaff/cluster_{i}\")\n", - " gmx_run(\"grompp -f em -c box.gro -p restrained.top -o em1\", workdir=f\"/gaff/cluster_{i}\")\n", - " gmx_run(\"mdrun -ntomp 2 -s em1 -c after_em1 -g em1 -e em1 -o em1\", workdir=f\"/gaff/cluster_{i}\")\n", - " gmx_run(\"grompp -f md -c box.gro -p MOL_GMX.top -o rerun\", workdir=f\"/gaff/cluster_{i}\")\n", - " gmx_run(\"mdrun -ntomp 2 -s rerun -rerun em1 -c after_rerun -g rerun -e rerun -o rerun\", workdir=f\"/gaff/cluster_{i}\")\n", - " gmx_run(\"energy -f rerun.edr -o rerun.xvg\", workdir=f\"/gaff/cluster_{i}\", groups=groups)" + "for cluster in tqdm(clusters):\n", + " cluster_workdir = f'gaff/{cluster}'\n", + " if not os.path.exists(cluster_workdir):\n", + " os.mkdir(cluster_workdir)\n", + " \n", + " 
shutil.copy(f'pdb_opt/{cluster}_opt.pdb', cluster_workdir)\n", + " shutil.copy('MOL_GMX.top', cluster_workdir)\n", + " shutil.copy('em.mdp', cluster_workdir)\n", + " shutil.copy('md.mdp', cluster_workdir)\n", + " generate_restraint(cluster)\n", + " gmx_run(f'editconf -f {cluster}_opt.pdb -box 3 3 3 -bt cubic -c -o box.gro', workdir=cluster_workdir)\n", + " gmx_run('grompp -f em -c box.gro -p restrained.top -o em1', workdir=cluster_workdir)\n", + " gmx_run('mdrun -ntomp 2 -s em1 -c after_em1 -g em1 -e em1 -o em1', workdir=cluster_workdir)\n", + " gmx_run('grompp -f md -c box.gro -p MOL_GMX.top -o rerun', workdir=cluster_workdir)\n", + " gmx_run('mdrun -ntomp 2 -s rerun -rerun em1 -c after_rerun -g rerun -e rerun -o rerun', workdir=cluster_workdir)\n", + " gmx_run('energy -f rerun.edr -o rerun.xvg', workdir=cluster_workdir, groups=groups)" ] }, { @@ -991,20 +1220,22 @@ "metadata": {}, "outputs": [], "source": [ - "#extract energies and from each energy value subtract minimal energy\n", + "# extract gaff energies and from each energy value \n", + "# subtract minimal energy\n", "\n", - "energies_lst = []\n", + "gaff_energies = {}\n", "\n", - "for i in clusters_ids:\n", - " with open(\"gaff/cluster_{}/rerun.xvg\".format(i, \"r\")) as ifile:\n", - " last_line = ifile.readlines()[-1]\n", - " energies = last_line.split(\" \")\n", - " energies_lst.append(energies[len(energies) - 1].rstrip())\n", + "for cluster in clusters:\n", + " with open(f'gaff/{cluster}/rerun.xvg', 'r') as infile:\n", + " last_line = infile.readlines()[-1]\n", + " energies = last_line.split(' ')\n", + " gaff_energies[cluster] = energies[len(energies) - 1].rstrip()\n", "\n", - "min_energy = min(energies_lst)\n", - "with open(\"gaff_energies.txt\", \"w\") as ofile:\n", - " for energy in energies_lst:\n", - " ofile.write(\"{} \\n\".format((float(energy) - float(min_energy))))" + " \n", + "min_energy = min(list(gaff_energies.values()))\n", + "\n", + "for cluster, energy in gaff_energies.items():\n", + " 
gaff_energies[cluster] = float(energy) - float(min_energy)" ] }, { @@ -1020,33 +1251,14 @@ "metadata": {}, "outputs": [], "source": [ - "#subtract from orca energies computed in step 5, GAFF energies computed in step 6\n", - "\n", - "with open(\"reference\", \"w\") as ofile:\n", - " with open(\"orca_energies.txt\", \"r\") as orca_energies, open(\"gaff_energies.txt\", \"r\") as gaff_energies:\n", - " for orca_energy in orca_energies.readlines():\n", - " for gaff_energy in gaff_energies.readlines():\n", - " ofile.write(\"{} \\n\".format((float(orca_energy) - float(gaff_energy))))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#write final corrections into output file along with representative conformations\n", - "\n", - "input_dir = \"pdb_opt/\"\n", - "\n", - "with open(\"reference.pdb\", \"w\") as ofile, open(\"reference\", \"r\") as ifile:\n", - " final_energies = ifile.readlines()\n", - " energy_index = 0\n", - " for i in clusters_ids:\n", - " ofile.write(\"REMARK X={}\".format(final_energies[energy_index]))\n", - " with open(\"{}cluster{}.pdb\".format(input_dir, i), \"r\") as ifile1:\n", - " ofile.writelines(ifile1.readlines())\n", - " energy_index += 1" + "# write corrected energy to final refernce.pdb file \n", + "\n", + "with open('reference.pdb', 'w') as outfile:\n", + " for cluster in clusters:\n", + " corrected_energy = orca_energies[cluster] - gaff_energies[cluster]\n", + " outfile.write(f'REMARK X={corrected_energy}\\n')\n", + " with open(f'pdb_opt/{cluster}_opt.pdb', 'r') as infile:\n", + " outfile.writelines(infile.readlines())" ] }, { @@ -1084,7 +1296,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.6" + "version": "3.7.3" } }, "nbformat": 4,