Dr.Shi's Studio.

python实现自动重排分子原子标号

Word count: 2.2kReading time: 10 min
2023/03/13
loading

此文主要介绍作者用python实现自动重新排序的脚本

1.依赖 python >= 3.7
numpy 矩阵计算库
scipy 数学计算库
re 正则匹配库

主体函数如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56

def hungarian(A, B):
"""
Hungarian reordering.
Assume A and B are coordinates for atoms of SAME type only
"""

# should be kabasch here i think
distances = cdist(A, B, "euclidean")

# Perform Hungarian analysis on distance matrix between atoms of 1st
# structure and trial structure
indices_a, indices_b = linear_sum_assignment(distances)

return indices_b


def reorder_hungarian(p_atoms, q_atoms, p_coord, q_coord):
"""
Re-orders the input atom list and xyz coordinates using the Hungarian
method (using optimized column results)
Parameters
----------
p_atoms : array
(N,1) matrix, where N is points holding the atoms' names
p_atoms : array
(N,1) matrix, where N is points holding the atoms' names
p_coord : array
(N,D) matrix, where N is points and D is dimension
q_coord : array
(N,D) matrix, where N is points and D is dimension
Returns
-------
view_reorder : array
(N,1) matrix, reordered indexes of atom alignment based on the
coordinates of the atoms
"""

# Find unique atoms
unique_atoms = np.unique(p_atoms)

# generate full view from q shape to fill in atom view on the fly
view_reorder = np.zeros(q_atoms.shape, dtype=int)
view_reorder -= 1

for atom in unique_atoms:
(p_atom_idx,) = np.where(p_atoms == atom)
(q_atom_idx,) = np.where(q_atoms == atom)

A_coord = p_coord[p_atom_idx]
B_coord = q_coord[q_atom_idx]

view = hungarian(A_coord, B_coord)
view_reorder[p_atom_idx] = q_atom_idx[view]

return view_reorder
接下来我们需要使用这两个函数对不同序号的分子尽心重新排列,首先准备两个测试使用的苯分子的xyz文件如下所示:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
6
coordinate 1
C 0.00000000 0.00000000 0.00000000
C 1.39516000 0.00000000 0.00000000
C 2.09269800 1.20775100 0.00000000
C 1.39504400 2.41626000 -0.00119900
C 0.00021900 2.41618200 -0.00167800
C -0.69738200 1.20797600 -0.00068200
H -0.54975900 -0.95231700 0.00045000
H 1.94466800 -0.95251300 0.00131500
H 3.19237800 1.20783100 0.00063400
H 1.94524400 3.36840300 -0.00125800
H -0.54990300 3.36846300 -0.00263100
H -1.79698600 1.20815900 -0.00086200


6
coordinate2
C 0.00000000 0.00000000 0.00000000
C 2.09269800 1.20775100 0.00000000
C 1.39516000 0.00000000 0.00000000
C 1.39504400 2.41626000 -0.00119900
C 0.00021900 2.41618200 -0.00167800
C -0.69738200 1.20797600 -0.00068200
H -0.54975900 -0.95231700 0.00045000
H 1.94466800 -0.95251300 0.00131500
H 3.19237800 1.20783100 0.00063400
H 1.94524400 3.36840300 -0.00125800
H -0.54990300 3.36846300 -0.00263100
H -1.79698600 1.20815900 -0.00086200

可以将上述两个文件粘贴复制到文件中命名为c6h6.xyz和c6h62.xyz,首先我们需要将xyz文件中的构型和分子的标号读取到矩阵中,程序中使用python的numpy库来保存作坐标和原子标号信息。这里我将给出一个简单实例如何从xyz文件中读取结构。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17

import re #引入正则匹配库,正则匹配库是python自带的库不需要额外安装
import numpy #引入numpy矩阵库,我们需要使用numpy库来保存矩阵,以及尽心矩阵计算


def read_coordinate(filename):#定义一个函数,其中参数需要一个文件名,这里我们要输入的是xyz文件
atoms = [] #定义一个列表,用来报错分子的标号
coordinate = [] #定义一个列表用来保存每一个原子的xyz坐标
with open(filename,"r") as file: #打开xyz文件
regex = re.search("[a-zA-Z]{1,2}(\s+-?\d+\.\d+){3}",line)#使用正则匹配文件中每一行的字符的格式,这里是一个通用的正则匹配形式,匹配每一行xyz坐标以及分子的标号
if regex:
#这里我们一下结果
print(line)


filename1 = "c6h6.xyz"
read_coordinate(filename1)

运行以上代码的到结果如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
C                  0.00000000    0.00000000    0.00000000

C 1.39516000 0.00000000 0.00000000

C 2.09269800 1.20775100 0.00000000

C 1.39504400 2.41626000 -0.00119900

C 0.00021900 2.41618200 -0.00167800

C -0.69738200 1.20797600 -0.00068200

H -0.54975900 -0.95231700 0.00045000

H 1.94466800 -0.95251300 0.00131500

H 3.19237800 1.20783100 0.00063400

H 1.94524400 3.36840300 -0.00125800

H -0.54990300 3.36846300 -0.00263100

H -1.79698600 1.20815900 -0.00086200
注意以上结果只是匹配到的字符,因此接下来我门需要将这些匹配的字符保存在矩阵中以便使用。 我们继续完善读取代码如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22

import re #引入正则匹配库,正则匹配库是python自带的库不需要额外安装
import numpy as np#引入numpy矩阵库,我们需要使用numpy库来保存矩阵,以及尽心矩阵计算


def read_coordinate(filename):#定义一个函数,其中参数需要一个文件名,这里我们要输入的是xyz文件
atoms = [] #定义一个列表,用来报错分子的标号
coordinate = [] #定义一个列表用来保存每一个原子的xyz坐标
with open(filename,"r") as file: #打开xyz文件
regex = re.search("[a-zA-Z]{1,2}(\s+-?\d+\.\d+){3}",line)#使用正则匹配文件中每一行的字符的格式,这里是一个通用的正则匹配形式,匹配每一行xyz坐标以及分子的标号
if regex:
#这里我们一下结果
#print(line)
coordiante.append([float(i) for i in regex.group().split()[1:4]])#使用列表推导
atoms.append(regex.group().split()[0])
coordiante = np.array(coordiante)#将坐标保存到矩阵中
atoms = np.array(atoms)#将分子标号保存到矩阵中
return atoms,coordiante #返回分子标号矩阵,和坐标矩阵

filename1 = "c6h6.xyz"
atoms,coordiante = read_coordinate(filename1) #通过函数返回值,我们的到原子标号坐标和分子构型坐标

这里我们已经完成怎么从xyz文件中读取分子坐标和序号并保存到矩阵中,从上面的一号坐标和二号坐标和中我们可以看出来,二号坐标的2号C和3号C的循序发生的发生了对调,因此我们需要对坐标二号从新排序,让二号的坐标顺序和一号相同,便于后续我们使用构型进行线性内插,做势能面。下面展示完整代码,对分子构型进行重新排序。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180

import re #引入正则匹配库,正则匹配库是python自带的库不需要额外安装
import numpy as np#引入numpy矩阵库,我们需要使用numpy库来保存矩阵,以及尽心矩阵计算
from scipy.optimize import linear_sum_assignment #引入scipy数学计算库
from scipy.spatial.distance import cdist

#设置矩阵的显示方式
np.set_printoptions(threshold=10000000000,linewidth=1000000000,precision=8,suppress=True)

def print_geo(coordinate,atoms):
natom =len(atoms)
for i in range(natom):
symbol = ''.join(format(atoms[i]))
coordinate_x = ''.join(format(coordinate[i][0], '>20.10f'))
coordinate_y = ''.join(format(coordinate[i][1], '>20.10f'))
coordinate_z = ''.join(format(coordinate[i][2], '>20.10f'))
print(symbol + coordinate_x + coordinate_y + coordinate_z)



def centeroid_translation(coordinate):
"""
centeroid coordinate can be got equal to original coordinate minus centroid
Parameters
----------
coordinate : array
(N,D) matrix, where N is points and D is dimension.
Returns
-------
coordinate - center
"""
center = centroid(coordinate)
return coordinate - center


def reorder_distance(p_atoms, q_atoms, p_coord, q_coord):
"""
Re-orders the input atom list and xyz coordinates by atom type and then by
distance of each atom from the centroid.
Parameters
----------
atoms : array
(N,1) matrix, where N is points holding the atoms' names
coord : array
(N,D) matrix, where N is points and D is dimension
Returns
-------
atoms_reordered : array
(N,1) matrix, where N is points holding the ordered atoms' names
coords_reordered : array
(N,D) matrix, where N is points and D is dimension (rows re-ordered)
"""

# Find unique atoms
unique_atoms = np.unique(p_atoms)

# generate full view from q shape to fill in atom view on the fly
view_reorder = np.zeros(q_atoms.shape, dtype=int)

for atom in unique_atoms:

(p_atom_idx,) = np.where(p_atoms == atom)
(q_atom_idx,) = np.where(q_atoms == atom)

A_coord = p_coord[p_atom_idx]
B_coord = q_coord[q_atom_idx]

# Calculate distance from each atom to centroid
A_norms = np.linalg.norm(A_coord, axis=1)
B_norms = np.linalg.norm(B_coord, axis=1)

reorder_indices_A = np.argsort(A_norms)
reorder_indices_B = np.argsort(B_norms)

# Project the order of P onto Q
translator = np.argsort(reorder_indices_A)
view = reorder_indices_B[translator]
view_reorder[p_atom_idx] = q_atom_idx[view]

return view_reorder


def hungarian(A, B):
"""
Hungarian reordering.
Assume A and B are coordinates for atoms of SAME type only
"""

# should be kabasch here i think
distances = cdist(A, B, "euclidean")

# Perform Hungarian analysis on distance matrix between atoms of 1st
# structure and trial structure
indices_a, indices_b = linear_sum_assignment(distances)

return indices_b


def reorder_hungarian(p_atoms, q_atoms, p_coord, q_coord):
"""
Re-orders the input atom list and xyz coordinates using the Hungarian
method (using optimized column results)
Parameters
----------
p_atoms : array
(N,1) matrix, where N is points holding the atoms' names
p_atoms : array
(N,1) matrix, where N is points holding the atoms' names
p_coord : array
(N,D) matrix, where N is points and D is dimension
q_coord : array
(N,D) matrix, where N is points and D is dimension
Returns
-------
view_reorder : array
(N,1) matrix, reordered indexes of atom alignment based on the
coordinates of the atoms
"""

# Find unique atoms
unique_atoms = np.unique(p_atoms)

# generate full view from q shape to fill in atom view on the fly
view_reorder = np.zeros(q_atoms.shape, dtype=int)
view_reorder -= 1

for atom in unique_atoms:
(p_atom_idx,) = np.where(p_atoms == atom)
(q_atom_idx,) = np.where(q_atoms == atom)

A_coord = p_coord[p_atom_idx]
B_coord = q_coord[q_atom_idx]

view = hungarian(A_coord, B_coord)
view_reorder[p_atom_idx] = q_atom_idx[view]

return view_reorder





def read_coordinate(filename):
"""
读取分子标号和分子构型函数
----------
Returns
-------
atoms : array
原子标号矩阵
coordiante : array
分子构型矩阵
"""
atoms = []
coordiante = []
with open(filename,"r") as file:
for line in file:
regex = re.search("[a-zA-Z]{1,2}(\s+-?\d+\.\d+){3}",line)
if regex:
coordiante.append([float(i) for i in regex.group().split()[1:4]])
atoms.append(regex.group().split()[0])
coordiante = np.array(coordiante)
atoms = np.array(atoms)

return atoms,coordiante

filename1 = "c6h6.xyz"
filename2 = "c6h62.xyz"
atoms1,coordiante1 = read_coordinate(filename1) #读取构型和分子标号
atoms2,coordiante2 = read_coordinate(filename2)
order = reorder_hungarian(atoms1,atoms2,coordiante1,coordiante2) #使用函数的到重新排序后的顺序
new_coordiante2 = coordiante2[order] #对二号坐标重新排序的到排序后的分子构型矩阵
new_atoms = atoms2[order]
print("重新排序")
print(order)
print(new_atoms)
print("重新排序的构型")
print_geo(new_coordiante2,new_atoms)


需要使用的同学自行修改代码。

CATALOG