python实现自动重排分子原子标号

Word count: 2.2kReading time: 10 min

 2023/03/13 

此文主要介绍作者用python实现自动重新排序的脚本

1.依赖 python >= 3.7
numpy 矩阵计算库
scipy 数学计算库
re 正则匹配库

主体函数如下所示


def hungarian(A, B):
    """
    Hungarian reordering.
    Assume A and B are coordinates for atoms of SAME type only
    """

    # should be kabasch here i think
    distances = cdist(A, B, "euclidean")

    # Perform Hungarian analysis on distance matrix between atoms of 1st
    # structure and trial structure
    indices_a, indices_b = linear_sum_assignment(distances)

    return indices_b


def reorder_hungarian(p_atoms, q_atoms, p_coord, q_coord):
    """
    Re-orders the input atom list and xyz coordinates using the Hungarian
    method (using optimized column results)
    Parameters
    ----------
    p_atoms : array
        (N,1) matrix, where N is points holding the atoms' names
    p_atoms : array
        (N,1) matrix, where N is points holding the atoms' names
    p_coord : array
        (N,D) matrix, where N is points and D is dimension
    q_coord : array
        (N,D) matrix, where N is points and D is dimension
    Returns
    -------
    view_reorder : array
             (N,1) matrix, reordered indexes of atom alignment based on the
             coordinates of the atoms
    """

    # Find unique atoms
    unique_atoms = np.unique(p_atoms)

    # generate full view from q shape to fill in atom view on the fly
    view_reorder = np.zeros(q_atoms.shape, dtype=int)
    view_reorder -= 1

    for atom in unique_atoms:
        (p_atom_idx,) = np.where(p_atoms == atom)
        (q_atom_idx,) = np.where(q_atoms == atom)

        A_coord = p_coord[p_atom_idx]
        B_coord = q_coord[q_atom_idx]

        view = hungarian(A_coord, B_coord)
        view_reorder[p_atom_idx] = q_atom_idx[view]

    return view_reorder

接下来我们需要使用这两个函数对不同序号的分子尽心重新排列，首先准备两个测试使用的苯分子的xyz文件如下所示：

6
coordinate 1
C                  0.00000000    0.00000000    0.00000000
C                  1.39516000    0.00000000    0.00000000
C                  2.09269800    1.20775100    0.00000000
C                  1.39504400    2.41626000   -0.00119900
C                  0.00021900    2.41618200   -0.00167800
C                 -0.69738200    1.20797600   -0.00068200
H                 -0.54975900   -0.95231700    0.00045000
H                  1.94466800   -0.95251300    0.00131500
H                  3.19237800    1.20783100    0.00063400
H                  1.94524400    3.36840300   -0.00125800
H                 -0.54990300    3.36846300   -0.00263100
H                 -1.79698600    1.20815900   -0.00086200


6
coordinate2
C                  0.00000000    0.00000000    0.00000000
C                  2.09269800    1.20775100    0.00000000
C                  1.39516000    0.00000000    0.00000000
C                  1.39504400    2.41626000   -0.00119900
C                  0.00021900    2.41618200   -0.00167800
C                 -0.69738200    1.20797600   -0.00068200
H                 -0.54975900   -0.95231700    0.00045000
H                  1.94466800   -0.95251300    0.00131500
H                  3.19237800    1.20783100    0.00063400
H                  1.94524400    3.36840300   -0.00125800
H                 -0.54990300    3.36846300   -0.00263100
H                 -1.79698600    1.20815900   -0.00086200

可以将上述两个文件粘贴复制到文件中命名为c6h6.xyz和c6h62.xyz，首先我们需要将xyz文件中的构型和分子的标号读取到矩阵中，程序中使用python的numpy库来保存作坐标和原子标号信息。这里我将给出一个简单实例如何从xyz文件中读取结构。


import re #引入正则匹配库，正则匹配库是python自带的库不需要额外安装
import numpy #引入numpy矩阵库，我们需要使用numpy库来保存矩阵，以及尽心矩阵计算


def read_coordinate(filename):#定义一个函数，其中参数需要一个文件名，这里我们要输入的是xyz文件
    atoms = [] #定义一个列表，用来报错分子的标号
    coordinate = [] #定义一个列表用来保存每一个原子的xyz坐标
    with open(filename,"r") as file: #打开xyz文件
            regex = re.search("[a-zA-Z]{1,2}(\s+-?\d+\.\d+){3}",line)#使用正则匹配文件中每一行的字符的格式，这里是一个通用的正则匹配形式，匹配每一行xyz坐标以及分子的标号
            if regex:
                #这里我们一下结果
                print(line)


filename1 = "c6h6.xyz"
read_coordinate(filename1)

运行以上代码的到结果如下所示

C                  0.00000000    0.00000000    0.00000000

C                  1.39516000    0.00000000    0.00000000

C                  2.09269800    1.20775100    0.00000000

C                  1.39504400    2.41626000   -0.00119900

C                  0.00021900    2.41618200   -0.00167800

C                 -0.69738200    1.20797600   -0.00068200

H                 -0.54975900   -0.95231700    0.00045000

H                  1.94466800   -0.95251300    0.00131500

H                  3.19237800    1.20783100    0.00063400

H                  1.94524400    3.36840300   -0.00125800

H                 -0.54990300    3.36846300   -0.00263100

H                 -1.79698600    1.20815900   -0.00086200

注意以上结果只是匹配到的字符，因此接下来我门需要将这些匹配的字符保存在矩阵中以便使用。我们继续完善读取代码如下：


import re #引入正则匹配库，正则匹配库是python自带的库不需要额外安装
import numpy as np#引入numpy矩阵库，我们需要使用numpy库来保存矩阵，以及尽心矩阵计算


def read_coordinate(filename):#定义一个函数，其中参数需要一个文件名，这里我们要输入的是xyz文件
    atoms = [] #定义一个列表，用来报错分子的标号
    coordinate = [] #定义一个列表用来保存每一个原子的xyz坐标
    with open(filename,"r") as file: #打开xyz文件
            regex = re.search("[a-zA-Z]{1,2}(\s+-?\d+\.\d+){3}",line)#使用正则匹配文件中每一行的字符的格式，这里是一个通用的正则匹配形式，匹配每一行xyz坐标以及分子的标号
            if regex:
                #这里我们一下结果
                #print(line)
                coordiante.append([float(i) for i in regex.group().split()[1:4]])#使用列表推导
                atoms.append(regex.group().split()[0])
    coordiante = np.array(coordiante)#将坐标保存到矩阵中
    atoms = np.array(atoms)#将分子标号保存到矩阵中
    return atoms，coordiante #返回分子标号矩阵，和坐标矩阵

filename1 = "c6h6.xyz"
atoms，coordiante = read_coordinate(filename1) #通过函数返回值，我们的到原子标号坐标和分子构型坐标

这里我们已经完成怎么从xyz文件中读取分子坐标和序号并保存到矩阵中，从上面的一号坐标和二号坐标和中我们可以看出来，二号坐标的2号C和3号C的循序发生的发生了对调，因此我们需要对坐标二号从新排序，让二号的坐标顺序和一号相同，便于后续我们使用构型进行线性内插，做势能面。下面展示完整代码，对分子构型进行重新排序。


import re #引入正则匹配库，正则匹配库是python自带的库不需要额外安装
import numpy as np#引入numpy矩阵库，我们需要使用numpy库来保存矩阵，以及尽心矩阵计算
from scipy.optimize import linear_sum_assignment #引入scipy数学计算库
from scipy.spatial.distance import cdist

#设置矩阵的显示方式
np.set_printoptions(threshold=10000000000,linewidth=1000000000,precision=8,suppress=True)

def print_geo(coordinate,atoms):
    natom =len(atoms)
    for i in range(natom):
        symbol = ''.join(format(atoms[i]))
        coordinate_x = ''.join(format(coordinate[i][0], '>20.10f'))
        coordinate_y = ''.join(format(coordinate[i][1], '>20.10f'))
        coordinate_z = ''.join(format(coordinate[i][2], '>20.10f'))
        print(symbol + coordinate_x + coordinate_y + coordinate_z)



def centeroid_translation(coordinate):
    """
    centeroid coordinate can be got equal to original coordinate minus centroid
    Parameters
    ----------
    coordinate : array
        (N,D) matrix, where N is points and D is dimension.
    Returns
    -------
        coordinate - center
    """
    center = centroid(coordinate)
    return coordinate - center


def reorder_distance(p_atoms, q_atoms, p_coord, q_coord):
    """
    Re-orders the input atom list and xyz coordinates by atom type and then by
    distance of each atom from the centroid.
    Parameters
    ----------
    atoms : array
        (N,1) matrix, where N is points holding the atoms' names
    coord : array
        (N,D) matrix, where N is points and D is dimension
    Returns
    -------
    atoms_reordered : array
        (N,1) matrix, where N is points holding the ordered atoms' names
    coords_reordered : array
        (N,D) matrix, where N is points and D is dimension (rows re-ordered)
    """

    # Find unique atoms
    unique_atoms = np.unique(p_atoms)

    # generate full view from q shape to fill in atom view on the fly
    view_reorder = np.zeros(q_atoms.shape, dtype=int)

    for atom in unique_atoms:

        (p_atom_idx,) = np.where(p_atoms == atom)
        (q_atom_idx,) = np.where(q_atoms == atom)

        A_coord = p_coord[p_atom_idx]
        B_coord = q_coord[q_atom_idx]

        # Calculate distance from each atom to centroid
        A_norms = np.linalg.norm(A_coord, axis=1)
        B_norms = np.linalg.norm(B_coord, axis=1)

        reorder_indices_A = np.argsort(A_norms)
        reorder_indices_B = np.argsort(B_norms)

        # Project the order of P onto Q
        translator = np.argsort(reorder_indices_A)
        view = reorder_indices_B[translator]
        view_reorder[p_atom_idx] = q_atom_idx[view]

    return view_reorder


def hungarian(A, B):
    """
    Hungarian reordering.
    Assume A and B are coordinates for atoms of SAME type only
    """

    # should be kabasch here i think
    distances = cdist(A, B, "euclidean")

    # Perform Hungarian analysis on distance matrix between atoms of 1st
    # structure and trial structure
    indices_a, indices_b = linear_sum_assignment(distances)

    return indices_b


def reorder_hungarian(p_atoms, q_atoms, p_coord, q_coord):
    """
    Re-orders the input atom list and xyz coordinates using the Hungarian
    method (using optimized column results)
    Parameters
    ----------
    p_atoms : array
        (N,1) matrix, where N is points holding the atoms' names
    p_atoms : array
        (N,1) matrix, where N is points holding the atoms' names
    p_coord : array
        (N,D) matrix, where N is points and D is dimension
    q_coord : array
        (N,D) matrix, where N is points and D is dimension
    Returns
    -------
    view_reorder : array
             (N,1) matrix, reordered indexes of atom alignment based on the
             coordinates of the atoms
    """

    # Find unique atoms
    unique_atoms = np.unique(p_atoms)

    # generate full view from q shape to fill in atom view on the fly
    view_reorder = np.zeros(q_atoms.shape, dtype=int)
    view_reorder -= 1

    for atom in unique_atoms:
        (p_atom_idx,) = np.where(p_atoms == atom)
        (q_atom_idx,) = np.where(q_atoms == atom)

        A_coord = p_coord[p_atom_idx]
        B_coord = q_coord[q_atom_idx]

        view = hungarian(A_coord, B_coord)
        view_reorder[p_atom_idx] = q_atom_idx[view]

    return view_reorder





def read_coordinate(filename):
    """
    读取分子标号和分子构型函数
    ----------
    Returns
    -------
    atoms : array
        原子标号矩阵
    coordiante : array
        分子构型矩阵
    """
    atoms = [] 
    coordiante = []
    with open(filename,"r") as file:
        for line in file:
            regex = re.search("[a-zA-Z]{1,2}(\s+-?\d+\.\d+){3}",line)
            if regex:
                coordiante.append([float(i) for i in regex.group().split()[1:4]])
                atoms.append(regex.group().split()[0])
    coordiante = np.array(coordiante)
    atoms = np.array(atoms)

    return atoms,coordiante

filename1 = "c6h6.xyz"
filename2 = "c6h62.xyz"
atoms1,coordiante1 = read_coordinate(filename1) #读取构型和分子标号
atoms2,coordiante2 = read_coordinate(filename2) 
order = reorder_hungarian(atoms1,atoms2,coordiante1,coordiante2) #使用函数的到重新排序后的顺序
new_coordiante2 = coordiante2[order] #对二号坐标重新排序的到排序后的分子构型矩阵
new_atoms = atoms2[order]
print("重新排序")
print(order)
print(new_atoms)
print("重新排序的构型")
print_geo(new_coordiante2,new_atoms)

需要使用的同学自行修改代码。

Author：Dr.Shi

Link：https://sspaino.github.io/2023/03/13/7/

Publish date：March 13th 2023, 2:29:00 pm

Update date：March 13th 2023, 3:58:47 pm

License：本文采用知识共享署名-非商业性使用 4.0 国际许可协议进行许可

Next Post

libcint基础使用
Previous Post

Fortran学习笔记（二）

CATALOG