Skip to content

QM40_dataset_for_ML

gaussian_info_extractor module

gaussian_info_extractor module¶

`GaussianInfoExtractor` ¶

Class to extract QM parameters from a Gaussian16 output file.

current_dir_path (str): path to the optimized compounds (each one is inside a folder named after its Zinc_id).
current_dir (str): the specific molecule's Zinc_id (its folder name).

Source code in qm40_dataset_for_ml/gaussian_info_extractor.py

class GaussianInfoExtractor:
    """
    Extract QM parameters from a Gaussian16 output file.

    Args:
    current_dir_path (str): directory that holds the molecule's files
        (``<current_dir>_run.out`` and ``<current_dir>.xyz`` are read from it)
    current_dir (str): Specific molecule's Zinc_id (folder name); also used as
        the stem of the file names above

    """

    def __init__(self, current_dir_path, current_dir):
        self.current_dir_path = current_dir_path
        self.current_dir = current_dir
        # (lines, file_name) of the last successfully read output file.
        self.output_data = None

    # read output files
    def output_file_reader(self) -> "tuple[list[str], str] | None":
        """
        Read ``<current_dir>_run.out`` from ``current_dir_path``.

        Returns:
            ``(lines, file_name)`` on success. If the file does not exist a
            message is printed and ``self.output_data`` is returned unchanged
            (``None`` unless an earlier read succeeded).
        """
        # NOTE(review): the working-directory change is kept because code
        # outside this class may rely on it - confirm before removing.
        os.chdir(self.current_dir_path)
        gout_name = f"{self.current_dir}_run.out"
        gout_path = os.path.join(self.current_dir_path, gout_name)
        try:
            with open(gout_path, "r") as f:
                lines = f.readlines()
        except FileNotFoundError:
            print(
                f"Gaussian out file {gout_name} not found in {self.current_dir_path}, skipping g_out data."
            )
        else:
            self.output_data = (lines, gout_name)

        return self.output_data

    def _output_lines(self) -> "list[str]":
        """Return the output file's lines, raising FileNotFoundError if it could not be read."""
        output = self.output_file_reader()
        if output is None:
            # Fail with a clear message instead of a TypeError on unpacking None.
            raise FileNotFoundError(
                f"Gaussian out file {self.current_dir}_run.out not found in {self.current_dir_path}"
            )
        return output[0]

    # find frequencies and report imaginary frequencies
    def imaginary_freq_checker(self) -> "str | None":
        """
        Report whether the output contains an imaginary (negative) frequency.

        Returns:
            ``self.current_dir`` if any value on a line containing
            "Frequencies" is negative, otherwise ``None``.

        Raises:
            FileNotFoundError: if the output file is missing.
        """
        for line in self._output_lines():
            row = line.strip()
            if "Frequencies" not in row:
                continue
            # The first two tokens are the label and the "--" separator.
            values = [float(token) for token in row.split()[2:]]
            if any(value < 0 for value in values):
                return self.current_dir
        return None

    def intial_coord_extract(self) -> "tuple[list[list[str]], int]":
        """
        Extract the input geometry listed under "Symbolic Z-matrix:".

        The atom count is taken from the first line of ``<current_dir>.xyz``.

        Returns:
            ``(rows, length)`` where ``rows`` holds one whitespace-split line
            per atom (empty if the section is absent) and ``length`` is the
            atom count.

        Raises:
            FileNotFoundError: if the ``.xyz`` or the output file is missing.
        """
        # Build the path explicitly so the lookup does not depend on the
        # current working directory.
        xyz_path = os.path.join(self.current_dir_path, f"{self.current_dir}.xyz")
        with open(xyz_path, "r") as file:
            length = int(file.readline().rstrip())

        lines = self._output_lines()
        ini_xyz = []
        for i, line in enumerate(lines):
            if "Symbolic Z-matrix:" in line:
                # Atom rows start two lines below the header (one line sits
                # in between). Only the first section is used so the result
                # always has exactly `length` rows.
                ini_xyz = [lines[i + 2 + k].strip().split() for k in range(length)]
                break
        return ini_xyz, length

    # Extract QM parameters from gaussian output file using keywords
    def qm_info_collector(self) -> list:
        """
        Collect QM quantities from the part of the output that follows the
        frequency-job route line.

        Returns:
            An 18-item list: initial coordinates, optimized coordinates,
            Mulliken charges, then 15 scalar values (as strings) in the order
            SCF energy, HOMO, LUMO, isotropic polarizability, electronic
            spatial extent, dipole moment, zero-point energy, three rotational
            constants, internal energy, enthalpy, free energy, Cv, entropy.
            A scalar whose keyword never appears is ``None``; a missing
            coordinate/charge section yields an empty list.

        Raises:
            FileNotFoundError: if the ``.xyz`` or the output file is missing.
        """
        lines = self._output_lines()
        print(self.current_dir)
        initial_coordinates, length = self.intial_coord_extract()

        start_line = (
            "#P Geom=AllCheck Guess=TCheck SCRF=Check GenChk RB3LYP/6-31G(2df,p) Fr"
        )
        opt_coordinates = "Standard orientation:"
        elec_E = "SCF Done"
        isotropic = "Isotropic polarizability"
        HOMO = "Alpha  occ. eigenvalues"
        Muliliken_charges = "Mulliken charges:"
        vibrational = "Zero-point vibrational energy"
        dipol = "Dipole moment (field-independent basis, Debye):"
        Internal_energy = "Sum of electronic and thermal Energies="

        opt = []
        charges = []
        # Pre-bind every scalar so an incomplete output yields None instead of
        # an UnboundLocalError at the return statement.
        tot_ele_E_0K = last_HOMO_line = first_LUMO_line = None
        polarizability = electronic = dipol_moment = None
        zero_point = rotational1 = rotational2 = rotational3 = None
        Internal_energy_295K = Enthalpy = Free_E = cv = entropy = None

        start_reading = False
        for i, line in enumerate(lines):
            row = line.strip()

            if start_line in row:
                start_reading = True

            # Everything before the route line above is ignored.
            if not start_reading:
                continue

            if opt_coordinates in row:
                # Atom rows begin five lines below the header. Rebuild the
                # list on every occurrence so only the last geometry is kept
                # and the result never exceeds `length` rows.
                opt = [lines[i + 5 + k].strip().split()[3:6] for k in range(length)]

            elif elec_E in row:
                tot_ele_E_0K = row.split()[4]

            elif HOMO in row:
                # Overwritten on every occupied-orbital line, so after the
                # loop these hold the values taken at the last such line.
                last_HOMO_line = row.split()[-1]
                first_LUMO_line = lines[i + 1].strip().split()[4]

            elif isotropic in row:
                polarizability = row.split()[5]

            elif Muliliken_charges in row:
                # Charges start two lines below the header; keep the last block.
                charges = [lines[i + 2 + k].strip().split()[2] for k in range(length)]

            elif dipol in row:
                # Two lines above the dipole header is read as the electronic
                # spatial extent; the line below carries the total dipole.
                electronic = lines[i - 2].strip().split()[5]
                dipol_moment = lines[i + 1].strip().split()[7]

            elif vibrational in row:
                zero_point = lines[i + 1].strip().split()[0]
                rotational1, rotational2, rotational3 = (
                    lines[i - 2].strip().split()[3:6]
                )

            elif Internal_energy in row:
                Internal_energy_295K = row.split()[6]
                Enthalpy = lines[i + 1].strip().split()[6]
                Free_E = lines[i + 2].strip().split()[7]
                cv_entropy = lines[i + 6].strip().split()
                cv = cv_entropy[2]
                entropy = cv_entropy[3]

        return [
            initial_coordinates,
            opt,
            charges,
            tot_ele_E_0K,
            last_HOMO_line,
            first_LUMO_line,
            polarizability,
            electronic,
            dipol_moment,
            zero_point,
            rotational1,
            rotational2,
            rotational3,
            Internal_energy_295K,
            Enthalpy,
            Free_E,
            cv,
            entropy,
        ]

    # making CSVs
    def making_csvs(self) -> "tuple[list, pd.DataFrame]":
        """
        Assemble the per-atom table and the list of molecular properties.

        Returns:
            ``(qm_parameters, df_coordinates)``: the 15 scalar values from
            ``qm_info_collector`` and a DataFrame with the initial
            coordinates, optimized coordinates and charge of each atom side
            by side.
        """
        init_coordinates, final_coordinates, charges, *qm_parameters = (
            self.qm_info_collector()
        )
        # Column labels are kept verbatim; downstream code may depend on them.
        df_init = pd.DataFrame(
            init_coordinates, columns=["atomic num", "ini_x", "init_y", "init_z"]
        )
        df_final = pd.DataFrame(
            final_coordinates, columns=["final_x", "final_y", "final_z"]
        )
        df_charges = pd.DataFrame(charges, columns=["Charge"])
        df_coordinates = pd.concat([df_init, df_final, df_charges], axis=1)
        return qm_parameters, df_coordinates