Creating a fault-tolerant NAT instance on AWS

Yet another NAT...

Published: Oct 21, 2019 by martoc

Now that the managed NAT Gateway exists, this may look like a thing of the past, but recently I had to configure a NAT instance in our deployment. This CloudFormation configuration creates an Auto Scaling group with a single instance, which allows the NAT instance to be recovered automatically if it fails. The following diagram shows the resources created by the stack.


This is the template. When the instance is launched, the script in the Resources/LaunchConfig/UserData section disables the source/destination check and updates the route table by calling the following AWS CLI operations.

  • modify-instance-attribute: This disables the source/destination check.
  • replace-route: Replaces the route for 0.0.0.0/0 (Internet) in the route table so that it goes via the NAT instance, using its instance ID.

Optionally, if an Elastic IP address (EIP) is tagged with Name=NAT, it will be attached to the instance. The following operations are used for this.

  • describe-addresses: Using a tag filter, it returns the allocation ID of the EIP.
  • associate-address: Associates the EIP returned in the previous step with the instance.
   "AWSTemplateFormatVersion": "2010-09-09",
   "Description": "NAT",
   "Parameters": {
      "Name": {
         "Description": "Name to default resources to. Applied as tag to resources",
         "Type": "String"
      "Version": {
         "Description": "Version/Revision of the stack. Applied as tag to resources",
         "Type": "String",
         "Default": "1.0.0"
      "Environment": {
         "Description": "Stage/lifecycle of the application. Applied as a tag to resources",
         "Type": "String"
      "ImageId": {
         "Description": "Ami to be used for NAT server",
         "Type": "String"
      "InstanceType": {
         "Description": "EC2 instance type",
         "Type": "String",
         "Default": "t3.medium",
         "ConstraintDescription": "must be a valid EC2 instance type"
      "VpcId": {
         "Description": "VPC associated with the provided Public/Private Subnets",
         "Type": "AWS::EC2::VPC::Id"
      "PublicSubnets": {
         "Description": "List of VPC subnet IDs for the cluster",
         "Type": "List<AWS::EC2::Subnet::Id>"
      "RouteTableId": {
         "Description": "Route Table Id",
         "Type": "String"
   "Resources": {
      "Role": {
         "Type": "AWS::IAM::Role",
         "Properties": {
            "AssumeRolePolicyDocument": {
                     "Effect": "Allow",
                     "Principal": {
            "Path": "/",
                  "PolicyName": {
                              "Ref": "Name"
                  "PolicyDocument": {
                     "Version": "2012-10-17",
                           "Effect": "Allow",
                           "Resource": "*"
      "InstanceProfile": {
         "Type": "AWS::IAM::InstanceProfile",
         "Properties": {
            "Path": "/",
                  "Ref": "Role"
      "ServerGroup": {
         "Type": "AWS::AutoScaling::AutoScalingGroup",
         "Properties": {
            "HealthCheckType": "EC2",
            "HealthCheckGracePeriod": 300,
            "LaunchConfigurationName": {
               "Ref": "LaunchConfig"
            "VPCZoneIdentifier": {
               "Ref": "PublicSubnets"
                  "Key": "Environment",
                  "Value": {
                     "Ref": "Environment"
                  "PropagateAtLaunch": "true"
                  "Key": "Name",
                  "Value": {
                     "Ref": "Name"
                  "PropagateAtLaunch": "true"
                  "Key": "Version",
                  "Value": {
                     "Ref": "Version"
                  "PropagateAtLaunch": "true"
      "ServerSecurityGroup": {
      "Type": "AWS::EC2::SecurityGroup",
      "Properties": {
        "GroupDescription": "NAT Security Group",
        "VpcId": { "Ref": "VpcId" },
        "SecurityGroupIngress": [
            "IpProtocol": "tcp",
            "FromPort": 1024,
            "ToPort": 65535,
            "CidrIp": ""
            "IpProtocol": "udp",
            "FromPort": 1024,
            "ToPort": 65535,
            "CidrIp": ""
        "Tags": [
            "Key": "Environment",
            "Value": {
              "Ref": "Environment"
            "Key": "Name",
            "Value": {
              "Ref": "Name"
            "Key": "Version",
            "Value": {
              "Ref": "Version"
      "LaunchConfig": {
         "Type": "AWS::AutoScaling::LaunchConfiguration",
         "Properties": {
            "ImageId": {
               "Ref": "ImageId"
            "SecurityGroups": {
               "Ref": "ServerSecurityGroup"
            "AssociatePublicIpAddress": "true",
            "IamInstanceProfile": {
               "Ref": "InstanceProfile"
            "InstanceType": {
               "Ref": "InstanceType"
            "InstanceMonitoring": "true",
            "UserData": {
               "Fn::Base64": {
                       "INSTANCE_ID=$(curl --silent\n",
                       "aws ec2 modify-instance-attribute --region ", { "Ref" : "AWS::Region"}, " --instance-id $INSTANCE_ID --no-source-dest-check\n",
                       "aws ec2 replace-route --region ", { "Ref" : "AWS::Region"}, " --route-table-id ", { "Ref": "RouteTableId" }, " --destination-cidr-block --instance-id $INSTANCE_ID\n",
                       "ELASTIC_IP_ALLOCATION_ID=$(aws ec2 describe-addresses --region ", { "Ref" : "AWS::Region"}, " --filter Name=tag:Name,Values=NAT Name=tag:Type,Values=instance --query Addresses[0].AllocationId | sed -e 's/\"//g')\n",
                       "if [ ! -z \"$ELASTIC_IP_ALLOCATION_ID\" ]; then aws ec2 associate-address --region ", { "Ref" : "AWS::Region"}, " --instance-id $INSTANCE_ID --allocation-id $ELASTIC_IP_ALLOCATION_ID; fi\n"


