diff --git a/source/module_basis/module_nao/numerical_radial.cpp b/source/module_basis/module_nao/numerical_radial.cpp
index 2b8b90d314..8355d31156 100644
--- a/source/module_basis/module_nao/numerical_radial.cpp
+++ b/source/module_basis/module_nao/numerical_radial.cpp
@@ -365,17 +365,21 @@ void NumericalRadial::radtab(const char op,
                              const int l,
                              double* const table,
                              const int nr_tab,
-                             const double* const rgrid_tab,
+                             const double rmax_tab,
                              const bool deriv) const
 {
     assert(op == 'S' || op == 'I' || op == 'T' || op == 'U');
     assert(l >= 0);
-    assert(rgrid_tab && nr_tab > 0);
+    assert(rmax_tab > 0 && nr_tab > 0);
 
     // radtab requires that two NumericalRadial objects have exactly the same (non-null) kgrid_
     assert(nk_ > 0 && nk_ == ket.nk_);
     assert(std::equal(kgrid_, kgrid_ + nk_, ket.kgrid_));
 
+    double* rgrid_tab = new double[nr_tab];
+    double dr = rmax_tab / (nr_tab - 1);
+    std::for_each(rgrid_tab, rgrid_tab + nr_tab, [dr,&rgrid_tab](double& r) { r = dr * (&r - rgrid_tab); });
+
     bool use_radrfft = is_fft_compliant(nr_tab, rgrid_tab, nk_, kgrid_);
 
     // function to undergo a spherical Bessel transform:
@@ -407,6 +411,7 @@ void NumericalRadial::radtab(const char op,
     }
 
     delete[] fk;
+    delete[] rgrid_tab;
 
     // spherical Bessel transform has a prefactor of sqrt(2/pi) while the prefactor for the radial table
     // of two-center integrals is 4*pi
diff --git a/source/module_basis/module_nao/numerical_radial.h b/source/module_basis/module_nao/numerical_radial.h
index 90435af821..e9f5238b27 100644
--- a/source/module_basis/module_nao/numerical_radial.h
+++ b/source/module_basis/module_nao/numerical_radial.h
@@ -191,17 +191,17 @@ class NumericalRadial
      *                          /  0                  l
      *
      *                                                                                  */
-    void radtab(const char op,                 //!< [in] operator, could be:
-                                               //!<        - 'S' or 'I': overlap
-                                               //!<        - 'T': kinetic
-                                               //!<        - 'U': Coulomb
-                const NumericalRadial& ket,    //!< [in] the other NumericalRadial object with which
-                                               //!       the two-center integral is computed
-                const int l,                   //!< [in] angular momentum of the table
-                double* const table,           //!< [out] on finish, contain the computed table
-                const int nr_tab,              //!< [in] size of table grid
-                const double* const rgrid_tab, //!< [in] grid on which the table is calculated.
-                const bool deriv = false       //!< [in] if true, calculates the derivative of the table
+    void radtab(const char op,              //!< [in] operator, could be:
+                                            //!<        - 'S' or 'I': overlap
+                                            //!<        - 'T': kinetic
+                                            //!<        - 'U': Coulomb
+                const NumericalRadial& ket, //!< [in] the other NumericalRadial object with which
+                                            //!       the two-center integral is computed
+                const int l,                //!< [in] angular momentum of the table
+                double* const table,        //!< [out] on finish, contain the computed table
+                const int nr_tab,           //!< [in] size of table grid
+                const double rmax_tab,      //!< [in] cutoff radius of table grid
+                const bool deriv = false    //!< [in] if true, calculates the derivative of the table
     ) const;
 
     //! Normalizes the radial function.
diff --git a/source/module_basis/module_nao/test/CMakeLists.txt b/source/module_basis/module_nao/test/CMakeLists.txt
index b935cbab22..11d4bb58e0 100644
--- a/source/module_basis/module_nao/test/CMakeLists.txt
+++ b/source/module_basis/module_nao/test/CMakeLists.txt
@@ -60,3 +60,17 @@ AddTest(
   LIBS ${math_libs} device base container kernels
 )
 
+AddTest(
+  TARGET two_center_integrator
+  SOURCES
+    two_center_integrator_test.cpp
+    ../two_center_integrator.cpp
+    ../two_center_table.cpp
+    ../real_gaunt_table.cpp
+    ../radial_collection.cpp
+    ../atomic_radials.cpp
+    ../beta_radials.cpp
+    ../radial_set.cpp
+    ../numerical_radial.cpp
+  LIBS ${math_libs} device base container kernels orb
+)
diff --git a/source/module_basis/module_nao/test/numerical_radial_test.cpp b/source/module_basis/module_nao/test/numerical_radial_test.cpp
index 21f2904661..9d39a9129e 100644
--- a/source/module_basis/module_nao/test/numerical_radial_test.cpp
+++ b/source/module_basis/module_nao/test/numerical_radial_test.cpp
@@ -472,29 +472,30 @@ TEST_F(NumericalRadialTest, RadialTable)
 
     double* table = new double[sz];
     double table_pref = ModuleBase::FOUR_PI * std::sqrt(ModuleBase::PI / 2.0);
+    double rmax_tab = chi1.rcut();
 
-    chi1.radtab('S', chi2, 0, table, chi1.nr(), chi1.ptr_rgrid());
+    chi1.radtab('S', chi2, 0, table, chi1.nr(), rmax_tab);
     for (int i = 0; i != sz; ++i)
     {
         double R = i * dr;
         EXPECT_NEAR(table[i], table_pref * (3 - R * R) / 32 * std::exp(-R * R / 2), tol);
     }
 
-    chi1.radtab('S', chi2, 2, table, chi1.nr(), chi1.ptr_rgrid());
+    chi1.radtab('S', chi2, 2, table, chi1.nr(), rmax_tab);
     for (int i = 0; i != sz; ++i)
     {
         double R = i * dr;
         EXPECT_NEAR(table[i], table_pref * R * R / 32 * std::exp(-R * R / 2), tol);
     }
 
-    chi1.radtab('T', chi2, 0, table, chi1.nr(), chi1.ptr_rgrid());
+    chi1.radtab('T', chi2, 0, table, chi1.nr(), rmax_tab);
     for (int i = 0; i != sz; ++i)
     {
         double R = i * dr;
         EXPECT_NEAR(table[i], table_pref * (std::pow(R, 4) - 10 * R * R + 15) / 32 * std::exp(-R * R / 2), tol);
     }
 
-    chi1.radtab('U', chi2, 0, table, chi1.nr(), chi1.ptr_rgrid());
+    chi1.radtab('U', chi2, 0, table, chi1.nr(), rmax_tab);
     for (int i = 0; i != sz; ++i)
     {
         double R = i * dr;
@@ -502,28 +503,28 @@ TEST_F(NumericalRadialTest, RadialTable)
     }
 
     // derivative of radial tables
-    chi1.radtab('S', chi2, 0, table, chi1.nr(), chi1.ptr_rgrid(), true);
+    chi1.radtab('S', chi2, 0, table, chi1.nr(), rmax_tab, true);
     for (int i = 0; i != sz; ++i)
     {
         double R = i * dr;
         EXPECT_NEAR(table[i], table_pref * (std::pow(R, 3) - 5 * R) / 32 * std::exp(-R * R / 2), tol);
     }
 
-    chi1.radtab('S', chi2, 2, table, chi1.nr(), chi1.ptr_rgrid(), true);
+    chi1.radtab('S', chi2, 2, table, chi1.nr(), rmax_tab, true);
     for (int i = 0; i != sz; ++i)
     {
         double R = i * dr;
         EXPECT_NEAR(table[i], table_pref * (2 * R - std::pow(R, 3)) / 32 * std::exp(-R * R / 2), tol);
     }
 
-    chi1.radtab('T', chi2, 0, table, chi1.nr(), chi1.ptr_rgrid(), true);
+    chi1.radtab('T', chi2, 0, table, chi1.nr(), rmax_tab, true);
     for (int i = 0; i != sz; ++i)
     {
         double R = i * dr;
         EXPECT_NEAR(table[i], table_pref * (-std::pow(R, 5) + 14 * std::pow(R, 3) - 35 * R) / 32 * std::exp(-R * R / 2), tol);
     }
 
-    chi1.radtab('U', chi2, 0, table, chi1.nr(), chi1.ptr_rgrid(), true);
+    chi1.radtab('U', chi2, 0, table, chi1.nr(), rmax_tab, true);
     for (int i = 0; i != sz; ++i)
     {
         double R = i * dr;
diff --git a/source/module_basis/module_nao/test/two_center_integrator_test.cpp b/source/module_basis/module_nao/test/two_center_integrator_test.cpp
new file mode 100644
index 0000000000..4b218f3cad
--- /dev/null
+++ b/source/module_basis/module_nao/test/two_center_integrator_test.cpp
@@ -0,0 +1,346 @@
+#include "module_basis/module_nao/two_center_integrator.h"
+
+#include "gtest/gtest.h"
+#include "module_base/constants.h"
+#include "module_base/spherical_bessel_transformer.h"
+#include "module_base/vector3.h"
+#include "module_base/ylm.h"
+
+#include <cstring>
+#include <chrono>
+using iclock = std::chrono::high_resolution_clock;
+
+//////////////////////////////////////////
+//! for comparison with module_ao
+#include "module_basis/module_ao/ORB_table_phi.h"
+//////////////////////////////////////////
+
+#ifdef __MPI
+#include <mpi.h>
+#endif
+
+/***********************************************************
+ *      Unit test of class "TwoCenterTable"
+ ***********************************************************/
+/*!
+ *  Tested functions:
+ *
+ *  - build
+ *      - builds a two-center integral radial table from two RadialCollection objects
+ *                                                                      */
+class TwoCenterIntegratorTest : public ::testing::Test
+{
+  protected:
+    void SetUp();
+    void TearDown();
+
+    TwoCenterIntegrator S_intor;
+    TwoCenterIntegrator T_intor;
+
+    RadialCollection orb;
+    int nfile = 0;                                                   //! number of orbital files
+    std::string* file = nullptr;                                     //!< orbital files to read from
+    std::string log_file = "./test_files/two_center_integrator.log"; //!< file for logging
+
+    double elem_tol = 1e-6; //! tolerance for comparison between new and legacy matrix elements
+};
+
+void TwoCenterIntegratorTest::SetUp()
+{
+#ifdef __MPI
+    MPI_Comm_rank(MPI_COMM_WORLD, &GlobalV::MY_RANK);
+#endif
+
+    std::string dir = "../../../../../tests/PP_ORB/";
+    nfile = 8;
+    file = new std::string[nfile];
+    file[0] = dir + "C_gga_8au_100Ry_2s2p1d.orb";
+    file[1] = dir + "Fe_gga_9au_100Ry_4s2p2d1f.orb";
+    file[2] = dir + "O_gga_10au_100Ry_2s2p1d.orb";
+    file[3] = dir + "H_gga_8au_60Ry_2s1p.orb";
+    file[4] = dir + "Cs_gga_10au_100Ry_4s2p1d.orb";
+    file[5] = dir + "Pb_gga_7au_100Ry_2s2p2d1f.orb";
+    file[6] = dir + "F_gga_7au_100Ry_2s2p1d.orb";
+    file[7] = dir + "I_gga_7au_100Ry_2s2p2d1f.orb";
+
+    ModuleBase::Ylm::set_coefficients();
+}
+
+void TwoCenterIntegratorTest::TearDown()
+{
+    delete[] file;
+}
+
+TEST_F(TwoCenterIntegratorTest, FiniteDifference)
+{
+    nfile = 3;
+    orb.build(nfile, file, 'o');
+    double rmax = orb.rcut_max() * 2.0;
+    double dr = 0.01;
+    int nr = static_cast<int>(rmax / dr) + 1;
+    
+    //ModuleBase::SphericalBesselTransformer sbt;
+    //sbt.set_fftw_plan_flag(FFTW_MEASURE); // not necessarily worth it!
+    //orb.set_transformer(&sbt, 0);
+    orb.set_uniform_grid(true, nr, rmax, 'i', true);
+
+    iclock::time_point start;
+    std::chrono::duration<double> dur;
+
+    start = iclock::now();
+
+    RealGauntTable rgt;
+    S_intor.tabulate(orb, orb, 'S', nr, rmax, true, &rgt);
+    T_intor.tabulate(orb, orb, 'T', nr, rmax, true, &rgt);
+
+    dur = iclock::now() - start;
+    std::cout << "time elapsed = " << dur.count() << " s" << std::endl;
+
+    // check whether analytical derivative and finite difference agree
+    int ntype = nfile;
+    double tol_d = 1e-4;
+
+    ModuleBase::Vector3<double> vR0 = {1.0, 2.0, 3.0};
+    ModuleBase::Vector3<double> vR;
+
+	for (int t1 = 0;  t1 < ntype ; t1++)
+	{
+		for (int l1 = 0; l1 <= orb(t1).lmax(); l1++)
+		{
+			for (int izeta1 = 0; izeta1 < orb(t1).nzeta(l1); izeta1++)
+			{
+                for (int m1 = -l1; m1 <= l1; ++m1)
+                {
+		            for (int t2 = t1 ; t2 < ntype ; t2++)
+		            {
+					    for (int l2 = 0; l2 <= orb(t2).lmax(); l2++)
+					    {
+					    	for (int izeta2 = 0; izeta2 < orb(t2).nzeta(l2); izeta2++)
+					    	{
+                                for (int m2 = -l2; m2 <= l2; ++m2)
+                                {
+                                    double dx = 1e-4;
+                                    double elem_p;
+                                    double elem_m;
+                                    double grad_elem[3];
+
+                                    // S
+                                    vR = vR0;
+                                    vR[2] += dx;
+                                    S_intor.calculate(t1, l1, izeta1, m1, t2, l2, izeta2, m2, vR, false, &elem_p);
+
+                                    vR = vR0;
+                                    vR[2] -= dx;
+                                    S_intor.calculate(t1, l1, izeta1, m1, t2, l2, izeta2, m2, vR, false, &elem_m);
+
+                                    S_intor.calculate(t1, l1, izeta1, m1, t2, l2, izeta2, m2, vR, true, grad_elem);
+
+                                    EXPECT_NEAR( (elem_p - elem_m) / (2. * dx), grad_elem[2], tol_d);
+
+                                    // T
+                                    vR = vR0;
+                                    vR[2] += dx;
+                                    T_intor.calculate(t1, l1, izeta1, m1, t2, l2, izeta2, m2, vR, false, &elem_p);
+
+                                    vR = vR0;
+                                    vR[2] -= dx;
+                                    T_intor.calculate(t1, l1, izeta1, m1, t2, l2, izeta2, m2, vR, false, &elem_m);
+
+                                    T_intor.calculate(t1, l1, izeta1, m1, t2, l2, izeta2, m2, vR, true, grad_elem);
+
+                                    EXPECT_NEAR( (elem_p - elem_m) / (2. * dx), grad_elem[2], tol_d);
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+//TEST_F(TwoCenterIntegratorTest, LegacyConsistency)
+//{
+//    // use less files so that the test wouldn't take too long
+//    nfile = 3;
+//
+//    orb.build(nfile, file, 'o');
+//    double rmax = orb.rcut_max() * 2.0;
+//    double dr = 0.01;
+//    int nr = static_cast<int>(rmax / dr) + 1;
+//    
+//    //ModuleBase::SphericalBesselTransformer sbt;
+//    //orb.set_transformer(&sbt, 0);
+//    orb.set_uniform_grid(true, nr, rmax, 'i', true);
+//
+//    iclock::time_point start;
+//    std::chrono::duration<double> dur;
+//
+//    start = iclock::now();
+//    S_tab.build(orb, orb, 'S', nr, rmax, true);
+//    T_tab.build(orb, orb, 'T', nr, rmax, true);
+//    dur = iclock::now() - start;
+//    std::cout << "radfft time elapsed = " << dur.count() << " s" << std::endl;
+//
+//    //////////////////////////////////////////////
+//    //      module_ao ORB_table_phi
+//    //////////////////////////////////////////////
+//
+//    ORB_table_phi otp;
+//    LCAO_Orbitals lcao;
+//    ModuleBase::Sph_Bessel_Recursive::D2 sbr;
+//
+//    // prepare data required to initialized ORB_table_phi
+//    std::ofstream ofs_log;
+//    int ntype;
+//    int lmax;
+//    int out_mat_r; // unused variable
+//    bool force_flag;
+//    int my_rank;
+//    bool deepks_setorb;
+//    bool read_in_flag;
+//    std::string descriptor_file;
+//    std::vector<std::string> orbital_file;
+//    double ecutwfc;
+//    double dk;
+//    double dR;
+//    double Rmax;
+//
+//    ofs_log.open("ORB_table_phi_test.log");
+//    ntype = nfile;
+//    lmax = 3;
+//    out_mat_r = 0; // unused variable
+//    force_flag = true;
+//    my_rank = 0;
+//    deepks_setorb = false;
+//
+//    read_in_flag = true;
+//    for (int i = 0; i != nfile; ++i) {
+//        orbital_file.push_back(file[i]);
+//    }
+//
+//    // below we mimic ORB_control::read_orb_first
+//    ecutwfc = 10000.0; // ecutwfc has to be very large in order to reach the agreement between new & old tables
+//    dk = 0.01;
+//    dR = 0.01;
+//    Rmax = 30;
+//
+//    lcao.read_in_flag = read_in_flag;
+//    lcao.orbital_file = orbital_file;
+//#ifdef __MPI
+//    lcao.bcast_files(ntype, GlobalV::MY_RANK);
+//#endif
+//
+//    // see ORB_control::read_orb_first
+//    lcao.ecutwfc = ecutwfc;
+//    lcao.dk = dk;
+//    lcao.dR = dR;
+//    lcao.Rmax = Rmax;
+//
+//    lcao.Read_Orbitals(ofs_log, ntype, lmax, deepks_setorb, out_mat_r,
+//            force_flag, my_rank);
+//
+//    otp.allocate(ntype, lmax, lcao.get_kmesh(), Rmax, dR, dk);
+//
+//    auto calc_nr = [](double const& Rmax, double const& dR) {
+//        int nr = static_cast<int>(Rmax / dR) + 4;
+//        return (nr%2) ? nr : nr+1;
+//    };
+//
+//    // initialize spherical bessel table
+//    sbr.set_dx(dR*dk);
+//
+//    // max(l+l) = 4, but the derivative calculation of j_l relies on j_{l+1}
+//    sbr.cal_jlx(2*lmax+1, calc_nr(Rmax, dR), lcao.get_kmesh());
+//
+//    otp.pSB = &sbr;
+//    otp.init_OV_Tpair(lcao);
+//    otp.init_OV_Opair(lcao);
+//
+//    start = iclock::now();
+//    otp.init_Table(lcao);
+//    dur = iclock::now() - start;
+//    std::cout << "legacy time elapsed = " << dur.count() << " s" << std::endl;
+//
+//	for (int T1 = 0;  T1 < ntype ; T1++)
+//	{
+//		for (int T2 = T1 ; T2 < ntype ; T2++)
+//		{
+//			int Tpair=otp.OV_Tpair(T1,T2);
+//
+//			for (int L1 = 0; L1 <= lcao.Phi[T1].getLmax(); L1++)
+//			{
+//				for (int N1 = 0; N1 < lcao.Phi[T1].getNchi(L1); N1++)
+//				{
+//					for (int L2 = 0; L2 <= lcao.Phi[T2].getLmax(); L2 ++)
+//					{
+//						for (int N2 = 0; N2 < lcao.Phi[T2].getNchi(L2); N2++)
+//						{
+//							int Opair = otp.OV_Opair(Tpair,L1,L2,N1,N2);
+//
+//							for (int L = std::abs(L1-L2); L <= (L1+L2) ; L += 2)
+//							{
+//                                // table whose integrand has a higher order of k is less accurate
+//                                // as it requires a larger ecutwfc to converge.
+//
+//                                //if (std::abs(S_tab.table(T1, L1, N1, T2, L2, N2, L)[0] -
+//                                //        otp.Table_SR[0][Tpair][Opair][L][0]) > 1e-4) {
+//                                //    std::cout << "T1 = " << T1 << ", L1 = " << L1 << ", N1 = " << N1 << std::endl;
+//                                //    std::cout << "T2 = " << T2 << ", L2 = " << L2 << ", N2 = " << N2 << std::endl;
+//                                //    std::cout << "L = " << L << std::endl;
+//
+//                                //    for (int ir = 1; ir < 10; ++ir) {
+//                                //        double rl = std::pow(ir * dr, L);
+//                                //        printf("%20.15e   %20.15e\n", S_tab.table(T1, L1, N1, T2, L2, N2, L)[ir] * rl, otp.Table_SR[0][Tpair][Opair][L][ir]);
+//                                //    }
+//                                //}
+//
+//                                // R == 0
+//                                //EXPECT_NEAR(S_tab.table(T1, L1, N1, T2, L2, N2, L)[0],
+//                                //        otp.Table_SR[0][Tpair][Opair][L][0], table_tol);
+//                                //EXPECT_NEAR(S_dtab.table(T1, L1, N1, T2, L2, N2, L)[0], 
+//                                //        otp.Table_SR[1][Tpair][Opair][L][0], table_tol);
+//                                //EXPECT_NEAR(T_tab.table(T1, L1, N1, T2, L2, N2, L)[0], 
+//                                //        otp.Table_TR[0][Tpair][Opair][L][0], table_tol * 10);
+//                                //EXPECT_NEAR(T_dtab.table(T1, L1, N1, T2, L2, N2, L)[0], 
+//                                //        otp.Table_TR[1][Tpair][Opair][L][0], table_tol * 100);
+//
+//                                // R > 0
+//                                for (int ir = 1; ir != 1600; ++ir) {
+//                                    double rl = std::pow(ir * dr, L);
+//                                    EXPECT_NEAR(S_tab.table(T1, L1, N1, T2, L2, N2, L)[ir] * rl,
+//                                            otp.Table_SR[0][Tpair][Opair][L][ir], table_tol);
+//                                    EXPECT_NEAR(S_tab.table(T1, L1, N1, T2, L2, N2, L, true)[ir], 
+//                                            otp.Table_SR[1][Tpair][Opair][L][ir], table_tol);
+//                                    EXPECT_NEAR(T_tab.table(T1, L1, N1, T2, L2, N2, L)[ir] * rl, 
+//                                            otp.Table_TR[0][Tpair][Opair][L][ir], table_tol * 10);
+//                                    EXPECT_NEAR(T_tab.table(T1, L1, N1, T2, L2, N2, L, true)[ir], 
+//                                            otp.Table_TR[1][Tpair][Opair][L][ir], table_tol * 100);
+//                                }
+//							}
+//						}
+//					}
+//				}
+//			}
+//		}
+//	}
+//
+//    otp.Destroy_Table(lcao);
+//}
+
+int main(int argc, char** argv)
+{
+
+#ifdef __MPI
+    MPI_Init(&argc, &argv);
+#endif
+
+    testing::InitGoogleTest(&argc, argv);
+    int result = RUN_ALL_TESTS();
+
+#ifdef __MPI
+    MPI_Finalize();
+#endif
+
+    return result;
+}
diff --git a/source/module_basis/module_nao/test/two_center_table_test.cpp b/source/module_basis/module_nao/test/two_center_table_test.cpp
index 6b6b461c20..b3da3996d5 100644
--- a/source/module_basis/module_nao/test/two_center_table_test.cpp
+++ b/source/module_basis/module_nao/test/two_center_table_test.cpp
@@ -6,6 +6,7 @@
 
 #include "module_base/math_integral.h"
 
+#include <cstring>
 #include <chrono>
 using iclock = std::chrono::high_resolution_clock;
 
@@ -34,12 +35,9 @@ class TwoCenterTableTest : public ::testing::Test
     void TearDown();
 
     TwoCenterTable S_tab;
-    TwoCenterTable S_dtab;
     TwoCenterTable T_tab;
-    TwoCenterTable T_dtab;
 
     TwoCenterTable betapsi_tab;
-    TwoCenterTable betapsi_dtab;
 
     RadialCollection orb;
     int nfile = 0;                                              //! number of orbital files
@@ -85,35 +83,71 @@ TEST_F(TwoCenterTableTest, BuildOverlapAndKinetic)
     //orb.set_transformer(&sbt, 0);
     orb.set_uniform_grid(true, nr, rmax, 'i', true);
 
-    const double* rgrid = orb(0,0,0).ptr_rgrid();
-
     iclock::time_point start;
     std::chrono::duration<double> dur;
 
     start = iclock::now();
-    S_tab.build(orb, orb, 'S', nr, rgrid, false);
-    T_tab.build(orb, orb, 'T', nr, rgrid, false);
-    S_dtab.build(orb, orb, 'S', nr, rgrid, true);
-    T_dtab.build(orb, orb, 'T', nr, rgrid, true);
+    S_tab.build(orb, orb, 'S', nr, rmax, true);
+    T_tab.build(orb, orb, 'T', nr, rmax, true);
     dur = iclock::now() - start;
     std::cout << "time elapsed = " << dur.count() << " s" << std::endl;
 
     // check whether analytical derivative and finite difference agree
-    for (int i = 0; i != S_tab.ntab(); ++i)
-    {
-        const double* f = S_tab.ptr_table(0, 0, 0, 0, 0, 0, 0) + i * S_tab.nr();
-        const double* df = S_dtab.ptr_table(0, 0, 0, 0, 0, 0, 0) + i * S_dtab.nr();
-        for (int ir = 5; ir != S_tab.nr() - 4; ++ir)
-        {
-            double df_fd = ( +1.0/280*f[ir-4] - 4.0/105*f[ir-3] 
-                      +1.0/5*f[ir-2] - 4.0/5*f[ir-1] 
-                      -1.0/280*f[ir+4] + 4.0/105*f[ir+3] 
-                      -1.0/5*f[ir+2] + 4.0/5*f[ir+1]
-                    ) / 0.01;
-
-            EXPECT_NEAR(df_fd, df[ir], 1e-5);
+    int ntype = nfile;
+    double tol_d = 1e-5;
+    double* f = new double[nr];
+	for (int T1 = 0;  T1 < ntype ; T1++)
+	{
+		for (int T2 = T1 ; T2 < ntype ; T2++)
+		{
+			for (int L1 = 0; L1 <= orb(T1).lmax(); L1++)
+			{
+				for (int N1 = 0; N1 < orb(T1).nzeta(L1); N1++)
+				{
+					for (int L2 = 0; L2 <= orb(T2).lmax(); L2++)
+					{
+						for (int N2 = 0; N2 < orb(T2).nzeta(L2); N2++)
+						{
+							for (int L = std::abs(L1-L2); L <= (L1+L2) ; L += 2)
+                            {
+                                const double* tmp = S_tab.table(T1, L1, N1, T2, L2, N2, L, false);
+                                const double* df  = S_tab.table(T1, L1, N1, T2, L2, N2, L, true);
+
+                                std::memcpy(f, tmp, nr * sizeof(double));
+
+                                // currently f is S(R)/R^l; multiply by R^l to get S(R)
+                                if (L > 0)
+                                {
+                                    std::for_each(f, f + nr, [dr, L, &f] (double& val) { val *= std::pow((&val - f) * dr, L); });
+                                }
+
+                                for (int ir = 4; ir != S_tab.nr() - 4; ++ir)
+                                {
+                                    double df_fd = ( - 1.0/280 *(f[ir+4] - f[ir-4])
+                                                     + 4.0/105 *(f[ir+3] - f[ir-3])
+                                                     - 0.2 * (f[ir+2] - f[ir-2])
+                                                     + 0.8 * (f[ir+1] - f[ir-1]) ) / dr;
+
+                                    EXPECT_NEAR(df_fd, df[ir], tol_d);
+                                }
+                            }
+                        }
+                    }
+                }
+            }
         }
     }
+    delete[] f;
+
+
+    EXPECT_EQ(S_tab.with_deriv(), true);
+    EXPECT_EQ(T_tab.with_deriv(), true);
+
+    EXPECT_EQ(S_tab.nr(), nr);
+    EXPECT_EQ(T_tab.nr(), nr);
+
+    EXPECT_EQ(S_tab.rmax(), rmax);
+    EXPECT_EQ(T_tab.rmax(), rmax);
 }
 
 TEST_F(TwoCenterTableTest, LegacyConsistency)
@@ -130,16 +164,12 @@ TEST_F(TwoCenterTableTest, LegacyConsistency)
     //orb.set_transformer(&sbt, 0);
     orb.set_uniform_grid(true, nr, rmax, 'i', true);
 
-    const double* rgrid = orb(0,0,0).ptr_rgrid();
-
     iclock::time_point start;
     std::chrono::duration<double> dur;
 
     start = iclock::now();
-    S_tab.build(orb, orb, 'S', nr, rgrid, false);
-    T_tab.build(orb, orb, 'T', nr, rgrid, false);
-    S_dtab.build(orb, orb, 'S', nr, rgrid, true);
-    T_dtab.build(orb, orb, 'T', nr, rgrid, true);
+    S_tab.build(orb, orb, 'S', nr, rmax, true);
+    T_tab.build(orb, orb, 'T', nr, rmax, true);
     dur = iclock::now() - start;
     std::cout << "radfft time elapsed = " << dur.count() << " s" << std::endl;
 
@@ -223,41 +253,6 @@ TEST_F(TwoCenterTableTest, LegacyConsistency)
     dur = iclock::now() - start;
     std::cout << "legacy time elapsed = " << dur.count() << " s" << std::endl;
 
-    //double* tmp = new double[2001];
-    //const NumericalRadial& f = orb(1,3,0);
-    //f.radtab('T', f, 6, tmp);
-    //for (int ir = 0; ir != 21; ++ir) {
-    //    std::cout << tmp[ir] << std::endl;
-    //}
-    //std::cout << std::endl;
-
-//    double* ktmp = new double[f.nk()];
-//    for (int ik = 0; ik != f.nk(); ++ik) {
-//        ktmp[ik] = std::pow(f.ptr_kvalue()[ik] * f.ptr_kgrid()[ik] * f.ptr_kgrid()[ik],2);
-//    }
-//    double r1 = f.ptr_rgrid()[1];
-
-
-    std::cout << std::endl;
-
-    // compare the table
-    // length of table depends on the cutoff radius
-    //for (int ir = 0; ir != 1600; ++ir) {
-    //    double err = std::abs(T_tab.ptr_table(1,3,0,1,3,0,6)[ir] - otp.Table_TR[0][3][80][6][ir]);
-    //    if (err > table_tol) {
-    //        std::cout << "                 ir = " << ir << std::endl;
-    //    }
-    //    EXPECT_NEAR(T_tab.ptr_table(1,3,0,1,3,0,6)[ir], otp.Table_TR[0][3][80][6][ir], table_tol);
-    //    //EXPECT_NEAR(S_tab.ptr_table(0,0,0,0,0,0,0)[ir], otp.Table_SR[0][0][0][0][ir], table_tol);
-    //    //EXPECT_NEAR(T_dtab.ptr_table(0,0,0,0,0,0,0)[ir], otp.Table_TR[1][0][0][0][ir], table_tol);
-    //    //EXPECT_NEAR(S_dtab.ptr_table(0,0,0,0,0,0,0)[ir], otp.Table_SR[1][0][0][0][ir], table_tol);
-    //}
-
-    //std::cout << T_tab.nr() << std::endl;
-    //std::cout << otp.OV_Tpair(1,1) << std::endl;
-    //std::cout << otp.OV_Opair(otp.OV_Tpair(1,1), 3, 3, 0, 0) << std::endl;
-
-    ////////////////////////////////////////////
 	for (int T1 = 0;  T1 < ntype ; T1++)
 	{
 		for (int T2 = T1 ; T2 < ntype ; T2++)
@@ -276,40 +271,42 @@ TEST_F(TwoCenterTableTest, LegacyConsistency)
 
 							for (int L = std::abs(L1-L2); L <= (L1+L2) ; L += 2)
 							{
-                                //if (T1 == 0 && L1 == 1 && N1 == 1 && T2 == 0 && L2 == 1 && N2 == 1 && L == 0) {
-                                //    for (int ir = 1; ir != 16; ++ir) {
-                                //        printf("%12.8f   %12.8f\n",
-                                //                T_dtab.ptr_table(T1, L1, N1, T2, L2, N2, L)[ir], 
-                                //                otp.Table_TR[1][Tpair][Opair][L][ir]);
+                                // table whose integrand has a higher order of k is less accurate
+                                // as it requires a larger ecutwfc to converge.
+
+                                //if (std::abs(S_tab.table(T1, L1, N1, T2, L2, N2, L)[0] -
+                                //        otp.Table_SR[0][Tpair][Opair][L][0]) > 1e-4) {
+                                //    std::cout << "T1 = " << T1 << ", L1 = " << L1 << ", N1 = " << N1 << std::endl;
+                                //    std::cout << "T2 = " << T2 << ", L2 = " << L2 << ", N2 = " << N2 << std::endl;
+                                //    std::cout << "L = " << L << std::endl;
+
+                                //    for (int ir = 1; ir < 10; ++ir) {
+                                //        double rl = std::pow(ir * dr, L);
+                                //        printf("%20.15e   %20.15e\n", S_tab.table(T1, L1, N1, T2, L2, N2, L)[ir] * rl, otp.Table_SR[0][Tpair][Opair][L][ir]);
                                 //    }
                                 //}
-                                //break;
-                                double err_max = -1.0;
+
+                                // R == 0
+                                //EXPECT_NEAR(S_tab.table(T1, L1, N1, T2, L2, N2, L)[0],
+                                //        otp.Table_SR[0][Tpair][Opair][L][0], table_tol);
+                                //EXPECT_NEAR(S_dtab.table(T1, L1, N1, T2, L2, N2, L)[0], 
+                                //        otp.Table_SR[1][Tpair][Opair][L][0], table_tol);
+                                //EXPECT_NEAR(T_tab.table(T1, L1, N1, T2, L2, N2, L)[0], 
+                                //        otp.Table_TR[0][Tpair][Opair][L][0], table_tol * 10);
+                                //EXPECT_NEAR(T_dtab.table(T1, L1, N1, T2, L2, N2, L)[0], 
+                                //        otp.Table_TR[1][Tpair][Opair][L][0], table_tol * 100);
+
+                                // R > 0
                                 for (int ir = 1; ir != 1600; ++ir) {
-                                    //double err = std::abs(T_dtab.ptr_table(T1, L1, N1, T2, L2, N2, L)[ir]
-                                    //            - otp.Table_TR[1][Tpair][Opair][L][ir]);
-                                    //if ( err > table_tol) {
-                                    //    if (err_max < 0) {
-                                    //        printf("%i  %i  %i  %i  %i  %i  %i  %5i", T1, L1, N1, T2, L2, N2, L, ir);
-                                    //        err_max = err;
-                                    //    } else {
-                                    //        err_max = std::max(err_max, err);
-                                    //    }
-                                    //}
-
-                                    // table whose integrand has a higher order of k is less accurate
-                                    // as it requires a larger ecutwfc to converge.
-                                    EXPECT_NEAR(T_tab.ptr_table(T1, L1, N1, T2, L2, N2, L)[ir], 
-                                            otp.Table_TR[0][Tpair][Opair][L][ir], table_tol * 10);
-                                    EXPECT_NEAR(S_tab.ptr_table(T1, L1, N1, T2, L2, N2, L)[ir], 
+                                    double rl = std::pow(ir * dr, L);
+                                    EXPECT_NEAR(S_tab.table(T1, L1, N1, T2, L2, N2, L)[ir] * rl,
                                             otp.Table_SR[0][Tpair][Opair][L][ir], table_tol);
-                                    EXPECT_NEAR(T_dtab.ptr_table(T1, L1, N1, T2, L2, N2, L)[ir], 
-                                            otp.Table_TR[1][Tpair][Opair][L][ir], table_tol * 100);
-                                    EXPECT_NEAR(S_dtab.ptr_table(T1, L1, N1, T2, L2, N2, L)[ir], 
+                                    EXPECT_NEAR(S_tab.table(T1, L1, N1, T2, L2, N2, L, true)[ir], 
                                             otp.Table_SR[1][Tpair][Opair][L][ir], table_tol);
-                                }
-                                if (err_max > 0) {
-                                    printf("   %8.5e\n", err_max);
+                                    EXPECT_NEAR(T_tab.table(T1, L1, N1, T2, L2, N2, L)[ir] * rl, 
+                                            otp.Table_TR[0][Tpair][Opair][L][ir], table_tol * 10);
+                                    EXPECT_NEAR(T_tab.table(T1, L1, N1, T2, L2, N2, L, true)[ir], 
+                                            otp.Table_TR[1][Tpair][Opair][L][ir], table_tol * 100);
                                 }
 							}
 						}
@@ -319,8 +316,6 @@ TEST_F(TwoCenterTableTest, LegacyConsistency)
 		}
 	}
 
-
-    //////////////////////////////////////////////
     otp.Destroy_Table(lcao);
 }
 
diff --git a/source/module_basis/module_nao/two_center_integrator.cpp b/source/module_basis/module_nao/two_center_integrator.cpp
new file mode 100644
index 0000000000..bef6563891
--- /dev/null
+++ b/source/module_basis/module_nao/two_center_integrator.cpp
@@ -0,0 +1,127 @@
+#include "module_basis/module_nao/two_center_integrator.h"
+
+#include "module_base/vector3.h"
+#include "module_base/ylm.h"
+
+TwoCenterIntegrator::TwoCenterIntegrator() : 
+    is_tabulated_(false),
+    op_('\0'),
+    with_deriv_(false),
+    use_internal_gaunt_(false),
+    rgt_(nullptr)
+{
+}
+
+TwoCenterIntegrator::~TwoCenterIntegrator()
+{
+    if (use_internal_gaunt_)
+    {
+        delete rgt_;
+    }
+}
+
+void TwoCenterIntegrator::tabulate(const RadialCollection& bra,
+                                 const RadialCollection& ket,
+                                 const char op,
+                                 const int nr,
+                                 const double cutoff,
+                                 const bool with_deriv,
+                                 RealGauntTable* const rgt)
+{
+    op_ = op;
+    with_deriv_ = with_deriv;
+    table_.build(bra, ket, op, nr, cutoff, with_deriv);
+
+    if (rgt)
+    { // if an external gaunt table is provided
+        if (use_internal_gaunt_)
+        {
+            delete rgt_;    
+            use_internal_gaunt_ = false;
+        }
+        rgt_ = rgt;
+    }
+    else
+    { // if no external gaunt table is provided (which implies an internal one)
+        if (!use_internal_gaunt_)
+        {
+            rgt_ = new RealGauntTable;
+            use_internal_gaunt_ = true;
+        }
+    }
+
+    rgt_->build(std::max(bra.lmax(), ket.lmax()));
+    is_tabulated_ = true;
+}
+
+void TwoCenterIntegrator::calculate(const int itype1, 
+                                    const int l1, 
+                                    const int izeta1, 
+                                    const int m1, 
+                                    const int itype2,
+                                    const int l2,
+                                    const int izeta2,
+                                    const int m2,
+	                                const ModuleBase::Vector3<double>& vR, // R = R2 - R1
+                                    const bool deriv,
+                                    double* out) const
+{
+    assert( is_tabulated_ );
+    assert( (deriv && with_deriv_) || !deriv );
+
+    double R = vR.norm();
+
+    // unit vector along R
+    ModuleBase::Vector3<double> uR = (R == 0.0 ? ModuleBase::Vector3<double>(0., 0., 1.) : vR / R);
+
+    std::fill(out, out + (deriv ? 3 : 1), 0.0);
+
+    // generate all necessary real spherical harmonics (multiplied by R^l)
+	std::vector<double> Rl_Y;
+	std::vector<std::vector<double>> grad_Rl_Y;
+
+	if (deriv)
+	{
+		ModuleBase::Ylm::grad_rl_sph_harm(l1 + l2, vR[0], vR[1], vR[2], Rl_Y, grad_Rl_Y);
+	}
+	else
+	{
+		ModuleBase::Ylm::rl_sph_harm(l1 + l2, vR[0], vR[1], vR[2], Rl_Y);
+	}
+
+    int sign = 1;
+    for (int l = std::abs(l1 - l2); l <= l1 + l2; l += 2)
+    {
+        double S_by_Rl = table_.lookup(itype1, l1, izeta1, itype2, l2, izeta2, l, R, false);
+
+        // tmp is (d/dR)(S/R^l)
+        double tmp = (deriv && R > 1e-6) ? 
+            table_.lookup(itype1, l1, izeta1, itype2, l2, izeta2, l, R, true) / std::pow(R, l)
+                - l / R * S_by_Rl
+            : 0.0;
+
+		for (int m = -l; m < l; ++m)
+        {
+            double G = (*rgt_)(l1, l2, l, m1, m2, m);
+
+            if (deriv)
+            {
+                for (int i = 0; i < 3; ++i)
+                {
+                    out[i] += sign * G * ( tmp * uR[i] * Rl_Y[ylm_index(l, m)] 
+                                            + S_by_Rl * grad_Rl_Y[ylm_index(l, m)][i] );
+                }
+            }
+            else
+            {
+                out[0] += sign * G * S_by_Rl * Rl_Y[ylm_index(l, m)];
+            }
+        }
+        sign = -sign;
+    }
+}
+
+int TwoCenterIntegrator::ylm_index(const int l, const int m) const
+{
+    return l * l + (m > 0 ? 2 * m - 1 : -2 * m);
+}
diff --git a/source/module_basis/module_nao/two_center_integrator.h b/source/module_basis/module_nao/two_center_integrator.h
new file mode 100644
index 0000000000..0dbbf456f2
--- /dev/null
+++ b/source/module_basis/module_nao/two_center_integrator.h
@@ -0,0 +1,123 @@
+#ifndef TWO_CENTER_INTEGRATOR_H_
+#define TWO_CENTER_INTEGRATOR_H_
+
+#include "module_basis/module_nao/two_center_table.h"
+#include "module_basis/module_nao/real_gaunt_table.h"
+#include "module_basis/module_nao/radial_collection.h"
+#include "module_base/vector3.h"
+
+/**
+ * @brief A class to compute two-center integrals
+ *
+ * This class computes two-center integrals
+ *
+ *                     /    
+ *              I(R) = | dr phi1(r) (op) phi2(r - R)
+ *                     /               
+ *
+ * as well as their gradients, where op is 1 (overlap) or minus Laplacian (kinetic),
+ * and phi1, phi2 are "atomic-orbital-like" functions of the form
+ *
+ *              phi(r) = chi(|r|) * Ylm(r/|r|)
+ *
+ * where chi is some numerical radial function and Ylm is some real spherical harmonics.
+ *
+ * This class is designed to efficiently compute the two-center integrals between 
+ * two "collections" of the above functions with various R, e.g., the overlap integrals 
+ * between all numerical atomic orbitals and all Kleinman-Bylander nonlocal projectors,
+ * the overlap & kinetic integrals between all numerical atomic orbitals, etc.
+ * This is done by tabulating the radial part of the integrals on an r-space grid and
+ * the real Gaunt coefficients in advance.
+ *
+ * See the developer's document for more details.
+ *                                                                                      */
+class TwoCenterIntegrator
+{
+  public:
+    TwoCenterIntegrator();
+    TwoCenterIntegrator(const TwoCenterIntegrator&) = delete;
+    TwoCenterIntegrator& operator=(const TwoCenterIntegrator&) = delete;
+
+    ~TwoCenterIntegrator();
+
+    /**
+     * @brief Tabulate the radial part of the two-center integrals
+     *
+     * @param[in] bra          The radial functions of the first collection.
+     * @param[in] ket          The radial functions of the second collection.
+     * @param[in] op           Operator, could be 'S' or 'T'.
+     * @param[in] nr           Number of r-space grid points.
+     * @param[in] cutoff       r-space cutoff radius.
+     * @param[in] with_deriv   If true, the derivative of radial table is also tabulated.
+     *                         This is necessary to compute the gradient of integrals.
+     * @param[in] rgt          Pointer to a real Gaunt table.
+     *                                                                                  */
+    void tabulate(const RadialCollection& bra,
+                  const RadialCollection& ket,
+                  const char op,
+                  const int nr,
+                  const double cutoff,
+                  const bool with_deriv,
+                  RealGauntTable* const rgt = nullptr
+    );
+
+    /**
+     * @brief Compute the two-center integrals
+     *
+     * This function calculates the two-center integral
+     *
+     *                     /    
+     *              I(R) = | dr phi1(r) (op_) phi2(r - R)
+     *                     /               
+     *
+     * or its gradient by using the tabulated radial part and real Gaunt coefficients.
+     *
+     * @param[in] itype1       Element index of orbital 1.
+     * @param[in] l1           Angular momentum of orbital 1.
+     * @param[in] izeta1       Zeta number of orbital 1.
+     * @param[in] m1           Magnetic quantum number of orbital 1.
+     * @param[in] itype2       Element index of orbital 2.
+     * @param[in] l2           Angular momentum of orbital 2.
+     * @param[in] izeta2       Zeta number of orbital 2.
+     * @param[in] m2           Magnetic quantum number of orbital 2.
+     * @param[in] vR           R2 - R1.
+     * @param[in] deriv        If true, the gradient of integrals is computed.
+     * @param[out] out         The computed integrals. If deriv is true, out[0], out[1]
+     *                         and out[2] are the x, y, z components of the gradient.
+     *                                                                                  */
+    void calculate(const int itype1, 
+                   const int l1, 
+                   const int izeta1, 
+                   const int m1, 
+                   const int itype2,
+                   const int l2,
+                   const int izeta2,
+                   const int m2,
+	               const ModuleBase::Vector3<double>& vR, // vR = R2 - R1
+                   const bool deriv,
+                   double* out
+    ) const;
+
+  private:
+    bool is_tabulated_;
+
+    char op_;
+    bool with_deriv_;
+    bool use_internal_gaunt_;
+
+    TwoCenterTable table_;
+    RealGauntTable* rgt_;
+
+    /**
+     * @brief Compute the index of (l,m) in the array of spherical harmonics
+     *
+     * Spherical harmonics in ABACUS are stored in the following order:
+     *
+     * index  0   1   2   3   4   5   6   7   8   9  10  11  12 ...
+     *   l    0   1   1   1   2   2   2   2   2   3   3   3   3 ...
+     *   m    0   0   1  -1   0   1  -1   2  -2   0   1  -1   2 ...
+     *                                                                                  */
+    int ylm_index(const int l, const int m) const;
+};
+
+#endif
diff --git a/source/module_basis/module_nao/two_center_table.cpp b/source/module_basis/module_nao/two_center_table.cpp
index 762defedfc..6b6db1bb57 100644
--- a/source/module_basis/module_nao/two_center_table.cpp
+++ b/source/module_basis/module_nao/two_center_table.cpp
@@ -3,25 +3,32 @@
 #include <algorithm>
 #include <cstring>
 #include <iostream>
+#include <limits>
+#include <numeric>
 
-TwoCenterTable::~TwoCenterTable() { delete[] rgrid_; }
+#include "module_base/constants.h"
+#include "module_base/math_integral.h"
 
 void TwoCenterTable::build(const RadialCollection& bra,
                            const RadialCollection& ket,
                            const char op,
                            const int nr,
-                           const double* const rgrid,
-                           const bool deriv)
+                           const double cutoff,
+                           const bool with_deriv)
 {
-    assert(nr > 0 && rgrid);
+    assert(nr >= 4 && cutoff > 0.0); // nr >= 4 required for polynomial interpolation
+
     cleanup();
 
     op_ = op;
-    is_deriv_ = deriv;
+    with_deriv_ = with_deriv;
 
     nr_ = nr;
-    rgrid_ = new double[nr_];
-    std::memcpy(rgrid_, rgrid, nr_ * sizeof(double));
+    rmax_ = cutoff;
+
+    double* rgrid = new double[nr_];
+    double dr = rmax_ / (nr_ - 1);
+    std::for_each(rgrid, rgrid + nr_, [&rgrid, dr](double& r) { r = (&r - rgrid) * dr; });
 
     // index the table by generating a map from (itype1, l1, izeta1, itype2, l2, izeta2, l) to a row index
     index_map_.resize({bra.ntype(), bra.lmax() + 1, bra.nzeta_max(),
@@ -29,62 +36,69 @@ void TwoCenterTable::build(const RadialCollection& bra,
     std::fill(index_map_.data<int>(), index_map_.data<int>() + index_map_.NumElements(), -1);
 
     ntab_ = 0;
+    two_center_loop(bra, ket, &TwoCenterTable::_indexing);
 
-    for (int l = 0; l <= bra.lmax() + ket.lmax(); ++l)
-    {
-        for (int l1 = 0; l1 <= bra.lmax(); ++l1)
-        {
-            for (const NumericalRadial** it1 = bra.cbegin(l1); it1 != bra.cend(l1); ++it1)
-            {
-                for (int l2 = std::abs(l1 - l); l2 <= std::min(ket.lmax(), l + l1); l2 += 2)
-                {
-                    for (const NumericalRadial** it2 = ket.cbegin(l2); it2 != ket.cend(l2); ++it2)
-                    {
-                        table_index(*it1, *it2, l) = ntab_;
-                        ++ntab_;
-                    }
-                }
-            }
-        }
-    }
-
-    // irow is now the number of rows in the table
     table_.resize({ntab_, nr_});
+    two_center_loop(bra, ket, &TwoCenterTable::_tabulate);
 
-    for (int l = 0; l <= bra.lmax() + ket.lmax(); ++l)
+    if (with_deriv_)
     {
-        for (int l1 = 0; l1 <= bra.lmax(); ++l1)
-        {
-            for (const NumericalRadial** it1 = bra.cbegin(l1); it1 != bra.cend(l1); ++it1)
-            {
-                for (int l2 = std::abs(l1 - l); l2 <= std::min(ket.lmax(), l + l1); l2 += 2)
-                {
-                    for (const NumericalRadial** it2 = ket.cbegin(l2); it2 != ket.cend(l2); ++it2)
-                    {
-                        (*it1)->radtab(op,
-                                       **it2,
-                                       l,
-                                       table_.inner_most_ptr<double>(table_index(*it1, *it2, l)),
-                                       nr_,
-                                       rgrid_,
-                                       deriv);
-                    }
-                }
-            }
-        }
+        // NOTE: for better performance, the loop of _tabulate_d should not be combined with _tabulate.
+        // This is due to the caching mechanism of SphericalBesselTransformer.
+        dtable_.resize({ntab_, nr_});
+        two_center_loop(bra, ket, &TwoCenterTable::_tabulate_d);
     }
 }
 
-const double* TwoCenterTable::ptr_table(const int itype1,
-                                        const int l1,
-                                        const int izeta1,
-                                        const int itype2,
-                                        const int l2,
-                                        const int izeta2,
-                                        const int l) const
+const double* TwoCenterTable::table(const int itype1,
+                                    const int l1,
+                                    const int izeta1,
+                                    const int itype2,
+                                    const int l2,
+                                    const int izeta2,
+                                    const int l,
+                                    const bool deriv) const
 {
     assert(is_present(itype1, l1, izeta1, itype2, l2, izeta2, l));
-    return table_.inner_most_ptr<double>(index_map_.get_value<int>(itype1, l1, izeta1, itype2, l2, izeta2, l));
+    return deriv ? dtable_.inner_most_ptr<double>(index_map_.get_value<int>(itype1, l1, izeta1, itype2, l2, izeta2, l)):
+                    table_.inner_most_ptr<double>(index_map_.get_value<int>(itype1, l1, izeta1, itype2, l2, izeta2, l));
+}
+
+double TwoCenterTable::lookup(const int itype1,
+                              const int l1,
+                              const int izeta1,
+                              const int itype2,
+                              const int l2,
+                              const int izeta2,
+                              const int l,
+                              const double R,
+                              const bool deriv) const
+{
+    assert(R >= 0);
+
+    if (R > rmax())
+    {
+        return 0.0;
+    }
+
+    const double* tab = table(itype1, l1, izeta1, itype2, l2, izeta2, l, deriv);
+    double dr = rmax_ / (nr_ - 1);
+
+    // find the maximum ir satisfying ir * dr <= R
+    int ir = static_cast<int>(R / dr);
+
+    // adjust ir so that (ir-1, ir, ir+1, ir+2) all fall within the boundary
+    ir += (ir == 0);
+    ir -= (ir >= nr_ - 2) + (ir == nr_ -1);
+
+    // apply Lagrange interpolation to data points with indices ir-1, ir, ir+1, ir+2
+    double dx0 = R / dr - ir + 1.0;
+    double dx1 = dx0 - 1.0;
+    double dx2 = dx1 - 1.0;
+    double dx3 = dx2 - 1.0;
+
+    return dx1 * dx2 * (tab[ir+2] * dx0 - tab[ir-1] * dx3) / 6.0 +
+           dx0 * dx3 * (tab[ir  ] * dx2 - tab[ir+1] * dx1) / 2.0;
 }
 
 int& TwoCenterTable::table_index(const NumericalRadial* it1, const NumericalRadial* it2, const int l)
@@ -95,13 +109,11 @@ int& TwoCenterTable::table_index(const NumericalRadial* it1, const NumericalRadi
 void TwoCenterTable::cleanup()
 {
     op_ = '\0';
-    is_deriv_ = false;
+    with_deriv_ = false;
     nr_ = 0;
 
-    delete[] rgrid_;
-    rgrid_ = nullptr;
-
     table_.resize({0});
+    dtable_.resize({0});
     index_map_.resize({0});
 }
 
@@ -121,3 +133,103 @@ bool TwoCenterTable::is_present(const int itype1,
            && izeta2 < index_map_.shape().dim_size(5) && l >= 0 && l <= index_map_.shape().dim_size(6)
            && index_map_.get_value<int>(itype1, l1, izeta1, itype2, l2, izeta2, l) >= 0;
 }
+
+double TwoCenterTable::dfact(int l) const
+{
+    double result = 1.0;
+    for (int i = l; i > 1; i -= 2)
+    {
+        result *= i;
+    }
+    return result;
+}
+
+void TwoCenterTable::two_center_loop(const RadialCollection& bra, 
+                                     const RadialCollection& ket, 
+                                     looped_func f)
+{
+    for (int l = 0; l <= bra.lmax() + ket.lmax(); ++l)
+        for (int l1 = 0; l1 <= bra.lmax(); ++l1)
+            for (const NumericalRadial** it1 = bra.cbegin(l1); it1 != bra.cend(l1); ++it1)
+                for (int l2 = std::abs(l1 - l); l2 <= std::min(ket.lmax(), l + l1); l2 += 2)
+                    for (const NumericalRadial** it2 = ket.cbegin(l2); it2 != ket.cend(l2); ++it2)
+                        (this->*f)(*it1, *it2, l);
+}
+
+void TwoCenterTable::_indexing(const NumericalRadial* it1, const NumericalRadial* it2, const int l)
+{
+    table_index(it1, it2, l) = ntab_++;
+}
+
+void TwoCenterTable::_tabulate(const NumericalRadial* it1, const NumericalRadial* it2, const int l)
+{
+    double* tab = table_.inner_most_ptr<double>(table_index(it1, it2, l));
+    it1->radtab(op_, *it2, l, tab, nr_, rmax_, false);
+
+    // NOTE: 
+    // A radial table stores S(R)/R^l or T(R)/R^l instead of bare S/T.
+    //
+    // When calculating two-center integrals, we need the product between
+    // S(or T) and Y. Note that spherical harmonics are given as R^l * Y_lm,
+    // thus the following limit (or the T(R) counterpart)
+    //
+    //                        S(R)
+    //                  lim   ----
+    //                  R->0    l
+    //                         R
+    //
+    // is explicited required and should be stored in the table. In order
+    // to maintain the consistency of the table, we choose to store S(R)/R^l
+    // not only for R->0 but also for R>0.
+    //
+    // See the developer's document for more details.
+    double dr = rmax_ / (nr_ - 1);
+    if ( l > 0 )
+    {
+        //std::transform(&tab[1], tab + nr_, &rgrid[1], &tab[1],
+        //        [&](double val, double r) { return val / std::pow(r, l); });
+        std::for_each(&tab[1], tab + nr_, [&](double& val) { val /= std::pow(dr * (&val - tab), l); });
+
+        // special treatment for R=0
+        int nk = it1->nk();
+        const double* kgrid = it1->ptr_kgrid();
+
+        double* fk = new double[nk];
+        double* h = new double[nk];
+        std::adjacent_difference(kgrid, kgrid + nk, h);
+
+        int op_exp = l;
+        switch (op_)
+        {
+        case 'S': op_exp += 2;
+                  break;
+        case 'T': op_exp += 4;
+                  break;
+        default: ; // currently not supposed to happen
+        }
+
+        for (int ik = 0; ik != nk; ++ik)
+        {
+            fk[ik] = it1->ptr_kvalue()[ik] * it2->ptr_kvalue()[ik] 
+                    * std::pow(kgrid[ik], op_exp);
+        }
+
+        tab[0] = ModuleBase::Integral::simpson(nk, fk, &h[1]) 
+                * ModuleBase::FOUR_PI / dfact(2 * l + 1);
+
+        delete[] fk;
+        delete[] h;
+    }
+}
+
+void TwoCenterTable::_tabulate_d(const NumericalRadial* it1, const NumericalRadial* it2, const int l)
+{
+    // The R->0 issue does not occur in the derivative calculation
+    // because the term which involves dS/dR or dT/dR vanishes at R=0.
+    // Therefore, there's no need to find the limit of (dS/dR)/R^(l-1) at R=0,
+    // and we choose to store dS/dR or dT/dR directly.
+    //
+    // See the developer's document for more details.
+    it1->radtab(op_, *it2, l, dtable_.inner_most_ptr<double>(table_index(it1, it2, l)), nr_, rmax_, true);
+}
+
diff --git a/source/module_basis/module_nao/two_center_table.h b/source/module_basis/module_nao/two_center_table.h
index 9aed451d4e..47034ef88d 100644
--- a/source/module_basis/module_nao/two_center_table.h
+++ b/source/module_basis/module_nao/two_center_table.h
@@ -8,15 +8,18 @@
 class TwoCenterTable
 {
   public:
-    TwoCenterTable() {}
-    ~TwoCenterTable();
-
-    void build(const RadialCollection& bra, //!< [in] radial collection involved in <bra|op|ket>
-               const RadialCollection& ket, //!< [in] radial collection involved in <bra|op|ket>
-               const char op,               //!< [in] operator of the two-center integral
-               const int nr,                //!< [in] number of table grid points
-               const double* const rgrid,   //!< [in] table grid
-               const bool deriv = false     //!< [in] if the target is a derivative table
+    TwoCenterTable() = default;
+    ~TwoCenterTable() = default;
+
+    TwoCenterTable(const TwoCenterTable&) = delete;
+    TwoCenterTable& operator=(const TwoCenterTable&) = delete;
+
+    void build(const RadialCollection& bra,  //!< [in] radial collection involved in <bra|op|ket>
+               const RadialCollection& ket,  //!< [in] radial collection involved in <bra|op|ket>
+               const char op,                //!< [in] operator of the two-center integral
+               const int nr,                 //!< [in] number of table grid points
+               const double cutoff,          //!< [in] cutoff radius of the table
+               const bool with_deriv = false //!< [in] if the derivative of radial table is also built
     );
 
     /*!
@@ -24,7 +27,7 @@ class TwoCenterTable
      *                                                                                      */
     //!@{
     //! returns true if the table stores the derivative of the radial table
-    bool is_deriv() const { return is_deriv_; }
+    bool with_deriv() const { return with_deriv_; }
 
     //! returns the operator of the two-center integral
     char op() const { return op_; }
@@ -36,34 +39,46 @@ class TwoCenterTable
     int ntab() const { return ntab_; }
 
     //! returns the radius cutoff of the table
-    double rmax() const { return rgrid_ ? rgrid_[nr_ - 1] : 0.0; }
-
-    //! gets the pointer to the table's grid points (read-only)
-    const double* ptr_rgrid() const { return rgrid_; }
-
-    //! gets the read-only pointer to a specific table entry
-    const double* ptr_table(const int itype1, //!< [in] element index of chi1
-                            const int l1,     //!< [in] angular momentum of chi1
-                            const int izeta1, //!< [in] zeta number of chi1
-                            const int itype2, //!< [in] element index of chi2
-                            const int l2,     //!< [in] angular momentum of chi2
-                            const int izeta2, //!< [in] zeta number of chi2
-                            const int l       //!< [in] angular momentum of the entry
+    double rmax() const { return rmax_; }
+
+    //! gets the read-only pointer to a specific table
+    const double* table(const int itype1,        //!< [in] element index of chi1
+                        const int l1,            //!< [in] angular momentum of chi1
+                        const int izeta1,        //!< [in] zeta number of chi1
+                        const int itype2,        //!< [in] element index of chi2
+                        const int l2,            //!< [in] angular momentum of chi2
+                        const int izeta2,        //!< [in] zeta number of chi2
+                        const int l,             //!< [in] angular momentum of the entry
+                        const bool deriv = false //!< [in] if true, return the derivative table
+    ) const;
+
+    double lookup(const int itype1,        //!< [in] element index of chi1
+                  const int l1,            //!< [in] angular momentum of chi1
+                  const int izeta1,        //!< [in] zeta number of chi1
+                  const int itype2,        //!< [in] element index of chi2
+                  const int l2,            //!< [in] angular momentum of chi2
+                  const int izeta2,        //!< [in] zeta number of chi2
+                  const int l,             //!< [in] angular momentum of the entry
+                  const double R,          //!< [in] distance between the two centers
+                  const bool deriv = false //!< [in] if true, look up the derivative table
     ) const;
     //!@}
 
   private:
-    char op_ = '\0';        //!< operator of the two-center integral
-    bool is_deriv_ = false; //!< if true, table_ stores the derivative of the radial table
+    char op_ = '\0';          //!< operator of the two-center integral
+    bool with_deriv_ = false; //!< if true, dtable_ is also built
 
-    int nr_ = 0;              //!< number of radial points of each table
-    double* rgrid_ = nullptr; //!< radial grid of each table
+    int ntab_ = 0;     //!< number of table entries
+    int nr_ = 0;       //!< number of radial points of each table
+    double rmax_= 0.0; //!< cutoff radius of the table
 
-    int ntab_ = 0; //!< number of table entries
 
     //! two-center integral radial table, stored as a row-major matrix
     container::Tensor table_{container::DataType::DT_DOUBLE, container::TensorShape({0})};
 
+    //! derivative of table_, built if with_deriv_ is true
+    container::Tensor dtable_{container::DataType::DT_DOUBLE, container::TensorShape({0})};
+
     //! map (itype1, l1, izeta1, itype2, l2, izeta2, l) to a row index in the table
     container::Tensor index_map_{container::DataType::DT_INT, container::TensorShape({0})};
 
@@ -81,6 +96,22 @@ class TwoCenterTable
                     const int l2,
                     const int izeta2,
                     const int l) const;
+
+    /// double factorial
+    double dfact(int l) const;
+
+    typedef void(TwoCenterTable::*looped_func)(const NumericalRadial*, const NumericalRadial*, const int);
+
+    /// executes a looped function over all combinations of two radial functions & l with non-vanishing Gaunt coefficients
+    void two_center_loop(const RadialCollection& bra, 
+                         const RadialCollection& ket, 
+                         looped_func f);
+
+    /// various looped functions during the construction of table
+    void _indexing(const NumericalRadial* it1, const NumericalRadial* it2, const int l);
+    void _tabulate(const NumericalRadial* it1, const NumericalRadial* it2, const int l);
+    void _tabulate_d(const NumericalRadial* it1, const NumericalRadial* it2, const int l);
 };
 
+
 #endif