@@ -1836,10 +1836,23 @@ sendControlMessage(icpkthdr *pkt, int fd, struct sockaddr *addr, socklen_t peerL
18361836 if (gp_interconnect_full_crc )
18371837 addCRC (pkt );
18381838
1839- char errDetail [100 ];
1840- snprintf (errDetail , sizeof (errDetail ), "Send control message: got error with seq %u" , pkt -> seq );
1841- /* Retry for infinite times since we have no retransmit mechanism for control message */
1842- n = sendtoWithRetry (fd , (const char * ) pkt , pkt -> len , 0 , addr , peerLen , -1 , errDetail );
1839+ /* retry 10 times for sending control message */
1840+ int counter = 0 ;
1841+ while (counter < 10 )
1842+ {
1843+ counter ++ ;
1844+ n = sendto (fd , (const char * ) pkt , pkt -> len , 0 , addr , peerLen );
1845+ if (n < 0 )
1846+ {
1847+ if (errno == EINTR || errno == EAGAIN || errno == EWOULDBLOCK )
1848+ continue ;
1849+ else {
1850+ write_log ("sendcontrolmessage: got errno %d" , errno );
1851+ return ;
1852+ }
1853+ }
1854+ break ;
1855+ }
18431856 if (n < pkt -> len )
18441857 write_log ("sendcontrolmessage: got error %d errno %d seq %d" , n , errno , pkt -> seq );
18451858}
@@ -4877,6 +4890,19 @@ sendtoWithRetry(int socket, const void *message, size_t length,
48774890 return n ;
48784891 }
48794892
4893+ /*
4894+ * If the OS can detect an MTU issue on the host network interfaces, we
4895+ * would get EMSGSIZE here. So, bail with a HINT about checking MTU.
4896+ */
4897+ if (errno == EMSGSIZE )
4898+ {
4899+ ereport (ERROR , (errcode (ERRCODE_GP_INTERCONNECTION_ERROR ),
4900+ errmsg ("Interconnect error writing an outgoing packet: %m" ),
4901+ errdetail ("error during sendto() call (error:%d).\n"
4902+ "%s" , save_errno , errDetail ),
4903+ errhint ("check if interface MTU is equal across the cluster and lower than gp_max_packet_size" )));
4904+ }
4905+
48804906 ereport (ERROR , (errcode (ERRCODE_GP_INTERCONNECTION_ERROR ),
48814907 errmsg ("Interconnect error writing an outgoing packet: %m" ),
48824908 errdetail ("error during sendto() call (error:%d).\n"
0 commit comments