Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions common/ctrl_msg/messages.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,11 @@ const (
ErrorTypeMisconfiguredTerminator = 2
ErrorTypeDialTimedOut = 3
ErrorTypeConnectionRefused = 4
ErrorTypeRejectedByApplication = 5
ErrorTypeDnsResolutionFailed = 6
ErrorTypePortNotAllowed = 7
ErrorTypeInvalidLinkDestination = 8
ErrorTypeResourcesNotAvailable = 9

CreateCircuitPeerDataHeader = 10

Expand Down
5 changes: 5 additions & 0 deletions controller/network/circuit_lifecycle.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,11 @@ const (
CircuitFailureRouterErrMisconfiguredTerminator CircuitFailureCause = "ROUTER_ERR_MISCONFIGURED_TERMINATOR"
CircuitFailureRouterErrDialTimedOut CircuitFailureCause = "ROUTER_ERR_DIAL_TIMED_OUT"
CircuitFailureRouterErrDialConnRefused CircuitFailureCause = "ROUTER_ERR_CONN_REFUSED"
CircuitFailureRouterErrRejectedByApp CircuitFailureCause = "ROUTER_ERR_REJECTED_BY_APPLICATION"
CircuitFailureRouterErrDnsResolutionFailed CircuitFailureCause = "ROUTER_ERR_DNS_RESOLUTION_FAILED"
CircuitFailureRouterErrPortNotAllowed CircuitFailureCause = "ROUTER_ERR_PORT_NOT_ALLOWED"
CircuitFailureRouterErrInvalidLinkDest CircuitFailureCause = "ROUTER_ERR_INVALID_LINK_DESTINATION"
CircuitFailureRouterErrResourcesNotAvailable CircuitFailureCause = "ROUTER_ERR_RESOURCES_NOT_AVAILABLE"
)

type CircuitError interface {
Expand Down
14 changes: 14 additions & 0 deletions controller/network/routesender.go
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,20 @@ func (self *routeSender) handleRouteSend(attempt uint32, path *model.Path, strat
case ctrl_msg.ErrorTypeConnectionRefused:
self.serviceCounters.ServiceTerminatorConnectionRefused(terminator.GetServiceId(), terminator.GetId())
failureCause = CircuitFailureRouterErrDialConnRefused
case ctrl_msg.ErrorTypeRejectedByApplication:
self.serviceCounters.ServiceDialOtherError(terminator.GetServiceId())
failureCause = CircuitFailureRouterErrRejectedByApp
case ctrl_msg.ErrorTypeDnsResolutionFailed:
self.serviceCounters.ServiceDialOtherError(terminator.GetServiceId())
failureCause = CircuitFailureRouterErrDnsResolutionFailed
case ctrl_msg.ErrorTypePortNotAllowed:
self.serviceCounters.ServiceDialOtherError(terminator.GetServiceId())
failureCause = CircuitFailureRouterErrPortNotAllowed
case ctrl_msg.ErrorTypeInvalidLinkDestination:
failureCause = CircuitFailureRouterErrInvalidLinkDest
case ctrl_msg.ErrorTypeResourcesNotAvailable:
self.serviceCounters.ServiceDialOtherError(terminator.GetServiceId())
failureCause = CircuitFailureRouterErrResourcesNotAvailable
default:
logger.WithField("errorCode", status.ErrorCode).Error("unhandled error code")
}
Expand Down
1 change: 1 addition & 0 deletions quickstart/docker/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ services:
image: "${ZITI_IMAGE}:${ZITI_VERSION}"
healthcheck:
test: curl -m 1 -s -k https://${ZITI_CTRL_EDGE_ADVERTISED_ADDRESS:-ziti-edge-controller}:${ZITI_CTRL_EDGE_ADVERTISED_PORT:-1280}/edge/client/v1/version
start_period: 30s
interval: 1s
timeout: 3s
retries: 30
Expand Down
19 changes: 14 additions & 5 deletions quickstart/docker/image/access-control.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,17 @@
echo "*****************************************************"
#### Add service policies

# Allow all identities to use any edge router with the "public" attribute
ziti edge create edge-router-policy all-endpoints-public-routers --edge-router-roles "#public" --identity-roles "#all"

# Allow all edge-routers to access all services
ziti edge create service-edge-router-policy all-routers-all-services --edge-router-roles "#all" --service-roles "#all"
# Retry in case the controller's Raft cluster hasn't elected a leader yet
_retries=20
while true; do
if ziti edge create edge-router-policy all-endpoints-public-routers --edge-router-roles "#public" --identity-roles "#all" 2>&1 \
&& ziti edge create service-edge-router-policy all-routers-all-services --edge-router-roles "#all" --service-roles "#all" 2>&1; then
break
fi
if (( --_retries == 0 )); then
echo "ERROR: failed to create access control policies after retries" >&2
exit 1
fi
echo "INFO: waiting for controller to be ready for policy creation (${_retries} retries left)..."
sleep 3
done
22 changes: 17 additions & 5 deletions quickstart/docker/image/run-router.sh
Original file line number Diff line number Diff line change
Expand Up @@ -63,14 +63,26 @@ if [ ! -f "${_CONFIG_PATH}" ]; then
fi


if "${ZITI_BIN_DIR-}/ziti" edge list edge-routers "name = \"${ZITI_ROUTER_NAME}\"" --csv | grep -q "${ZITI_ROUTER_NAME}"; then
echo "---------- Found existing edge-router ${ZITI_ROUTER_NAME}...."
else
"${ZITI_BIN_DIR}/ziti" edge create edge-router "${ZITI_ROUTER_NAME}" -o "${ZITI_HOME}/${ZITI_ROUTER_NAME}.jwt" -t -a "${ZITI_ROUTER_ROLES}"
sleep 1
# Retry the edge-router creation in case the controller's Raft cluster hasn't elected a leader yet
_retries=20
while ! "${ZITI_BIN_DIR-}/ziti" edge list edge-routers "name = \"${ZITI_ROUTER_NAME}\"" --csv 2>/dev/null | grep -q "${ZITI_ROUTER_NAME}"; do
if "${ZITI_BIN_DIR}/ziti" edge create edge-router "${ZITI_ROUTER_NAME}" -o "${ZITI_HOME}/${ZITI_ROUTER_NAME}.jwt" -t -a "${ZITI_ROUTER_ROLES}" 2>&1; then
break
fi
if (( --_retries == 0 )); then
echo "ERROR: failed to create edge-router ${ZITI_ROUTER_NAME} after retries" >&2
exit 1
fi
echo "INFO: waiting for controller to be ready to create edge-router (${_retries} retries left)..."
sleep 3
done

if [ -f "${ZITI_HOME}/${ZITI_ROUTER_NAME}.jwt" ]; then
echo "---------- Enrolling edge-router ${ZITI_ROUTER_NAME}...."
"${ZITI_BIN_DIR}/ziti" router enroll "${ZITI_HOME}/${ZITI_ROUTER_NAME}.yaml" --jwt "${ZITI_HOME}/${ZITI_ROUTER_NAME}.jwt"
echo ""
else
echo "---------- Found existing edge-router ${ZITI_ROUTER_NAME}...."
fi
else
echo " Found existing config file ${_CONFIG_PATH}, not creating a new config."
Expand Down
1 change: 1 addition & 0 deletions quickstart/docker/simplified-docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ services:
image: "${ZITI_IMAGE}:${ZITI_VERSION}"
healthcheck:
test: curl -m 1 -s -k -f https://${ZITI_CTRL_EDGE_ADVERTISED_ADDRESS:-ziti-edge-controller}:${ZITI_CTRL_EDGE_ADVERTISED_PORT:-1280}/edge/client/v1/version
start_period: 30s
interval: 3s
timeout: 3s
retries: 60
Expand Down
9 changes: 7 additions & 2 deletions quickstart/test/compose-test.zsh
Original file line number Diff line number Diff line change
Expand Up @@ -149,16 +149,21 @@ fi

# wait for the controller and router to be ready and run the certificate check script; NOUNSET option is enabled after
# sourcing quickstart functions and env because there are some unset variables in those
docker compose exec ziti-controller \
if ! docker compose exec ziti-controller \
bash -eo pipefail -c '
source "${ZITI_SCRIPTS}/ziti-cli-functions.sh" >/dev/null;
echo "INFO: waiting for controller";
source /persistent/ziti.env >/dev/null;
_wait_for_controller >/dev/null;
echo "INFO: waiting for public router";
source /persistent/ziti.env >/dev/null;
_wait_for_public_router >/dev/null;
_wait_for_public_router;
'
then
echo "ERROR: router failed to come up, dumping compose logs"
docker compose logs
exit 1
fi
# TODO: re-add cert checks to above test suite after https://github.com/openziti/ziti/pull/1278
# zsh /persistent/check-cert-chains.zsh;
docker compose --profile test run --rm quickstart-test
Expand Down
22 changes: 21 additions & 1 deletion router/forwarder/forwarder.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,26 @@ import (
"github.com/sirupsen/logrus"
)

// InvalidLinkDestinationError indicates a route referenced a link destination that does not exist.
type InvalidLinkDestinationError struct {
LinkId string
}

func (e *InvalidLinkDestinationError) Error() string {
return fmt.Sprintf("invalid link destination %v", e.LinkId)
}

// NewInvalidLinkDestinationError creates an error indicating that the specified link destination is not valid.
func NewInvalidLinkDestinationError(linkId string) error {
return &InvalidLinkDestinationError{LinkId: linkId}
}

// IsInvalidLinkDestinationError returns true if the error is or wraps an InvalidLinkDestinationError.
func IsInvalidLinkDestinationError(err error) bool {
var target *InvalidLinkDestinationError
return errors.As(err, &target)
}

type Forwarder struct {
circuits *circuitTable
destinations *destinationTable
Expand Down Expand Up @@ -131,7 +151,7 @@ func (forwarder *Forwarder) Route(ctrlId string, route *ctrl_pb.Route) error {
if !forwarder.HasDestination(xgress.Address(forward.DstAddress)) {
if forward.DstType == ctrl_pb.DestType_Link {
forwarder.faulter.NotifyInvalidLink(forward.DstAddress)
return fmt.Errorf("invalid link destination %v", forward.DstAddress)
return NewInvalidLinkDestinationError(forward.DstAddress)
}
if forward.DstType == ctrl_pb.DestType_End {
return fmt.Errorf("invalid egress destination %v", forward.DstAddress)
Expand Down
110 changes: 110 additions & 0 deletions router/handler_ctrl/classify_dial_error_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
/*
Copyright NetFoundry Inc.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package handler_ctrl

import (
"fmt"
"net"
"syscall"
"testing"

"github.com/openziti/sdk-golang/xgress"
"github.com/openziti/ziti/v2/common/ctrl_msg"
"github.com/pkg/errors"
"github.com/stretchr/testify/require"
)

func Test_classifyDialError(t *testing.T) {
t.Run("connection refused", func(t *testing.T) {
err := fmt.Errorf("dial failed: %w", syscall.ECONNREFUSED)
require.Equal(t, byte(ctrl_msg.ErrorTypeConnectionRefused), classifyDialError(err))
})

t.Run("DNS resolution failure", func(t *testing.T) {
err := &net.DNSError{Err: "no such host", Name: "nonexistent.example.com", IsNotFound: true}
require.Equal(t, byte(ctrl_msg.ErrorTypeDnsResolutionFailed), classifyDialError(err))
})

t.Run("wrapped DNS resolution failure", func(t *testing.T) {
dnsErr := &net.DNSError{Err: "no such host", Name: "nonexistent.example.com", IsNotFound: true}
err := errors.Wrap(dnsErr, "failed to dial")
require.Equal(t, byte(ctrl_msg.ErrorTypeDnsResolutionFailed), classifyDialError(err))
})

t.Run("resource exhaustion EMFILE", func(t *testing.T) {
err := fmt.Errorf("dial failed: %w", syscall.EMFILE)
require.Equal(t, byte(ctrl_msg.ErrorTypeResourcesNotAvailable), classifyDialError(err))
})

t.Run("resource exhaustion ENFILE", func(t *testing.T) {
err := fmt.Errorf("dial failed: %w", syscall.ENFILE)
require.Equal(t, byte(ctrl_msg.ErrorTypeResourcesNotAvailable), classifyDialError(err))
})

t.Run("resource exhaustion ENOBUFS", func(t *testing.T) {
err := fmt.Errorf("dial failed: %w", syscall.ENOBUFS)
require.Equal(t, byte(ctrl_msg.ErrorTypeResourcesNotAvailable), classifyDialError(err))
})

t.Run("timeout", func(t *testing.T) {
err := fmt.Errorf("dial failed: %w", syscall.ETIMEDOUT)
require.Equal(t, byte(ctrl_msg.ErrorTypeDialTimedOut), classifyDialError(err))
})

t.Run("misconfigured terminator", func(t *testing.T) {
err := xgress.MisconfiguredTerminatorError{InnerError: fmt.Errorf("bad address")}
require.Equal(t, byte(ctrl_msg.ErrorTypeMisconfiguredTerminator), classifyDialError(err))
})

t.Run("invalid terminator", func(t *testing.T) {
err := xgress.InvalidTerminatorError{InnerError: fmt.Errorf("not found")}
require.Equal(t, byte(ctrl_msg.ErrorTypeInvalidTerminator), classifyDialError(err))
})

t.Run("port not allowed", func(t *testing.T) {
err := fmt.Errorf("failed to establish connection: port 7070 is not in allowed port ranges")
require.Equal(t, byte(ctrl_msg.ErrorTypePortNotAllowed), classifyDialError(err))
})

t.Run("rejected by application", func(t *testing.T) {
err := fmt.Errorf("failed to establish connection with terminator address abc. error: (rejected by application)")
require.Equal(t, byte(ctrl_msg.ErrorTypeRejectedByApplication), classifyDialError(err))
})

t.Run("generic error", func(t *testing.T) {
err := fmt.Errorf("something unexpected happened")
require.Equal(t, byte(ctrl_msg.ErrorTypeGeneric), classifyDialError(err))
})

// DNS errors should be classified before timeout errors, since *net.DNSError implements net.Error
t.Run("DNS error not classified as timeout", func(t *testing.T) {
err := &net.DNSError{Err: "server misbehaving", Name: "example.com", IsTimeout: true}
require.Equal(t, byte(ctrl_msg.ErrorTypeDnsResolutionFailed), classifyDialError(err),
"DNS errors should be classified as DNS failures even when IsTimeout is true")
})

// ER/T hosted services send errors as strings, so DNS errors lose their type
t.Run("DNS error from string (no such host)", func(t *testing.T) {
err := fmt.Errorf("failed to establish connection: dial tcp: lookup bad.host on 127.0.0.53:53: no such host")
require.Equal(t, byte(ctrl_msg.ErrorTypeDnsResolutionFailed), classifyDialError(err))
})

t.Run("DNS error from string (server misbehaving)", func(t *testing.T) {
err := fmt.Errorf("failed to establish connection: dial tcp: lookup bad.host on 127.0.0.53:53: server misbehaving")
require.Equal(t, byte(ctrl_msg.ErrorTypeDnsResolutionFailed), classifyDialError(err))
})
}
69 changes: 54 additions & 15 deletions router/handler_ctrl/route.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package handler_ctrl

import (
"net"
"strings"
"syscall"
"time"

Expand Down Expand Up @@ -115,7 +116,11 @@ func (rh *routeHandler) HandleReceive(msg *channel.Message, ch channel.Channel)

func (rh *routeHandler) completeRoute(msg *channel.Message, attempt int, route *ctrl_pb.Route, peerData xt.PeerData, log *logrus.Entry) {
if err := rh.forwarder.Route(rh.ch.PeerId(), route); err != nil {
rh.fail(msg, attempt, route, err, ctrl_msg.ErrorTypeGeneric, log)
var errCode byte = ctrl_msg.ErrorTypeGeneric
if forwarder.IsInvalidLinkDestinationError(err) {
errCode = ctrl_msg.ErrorTypeInvalidLinkDestination
}
rh.fail(msg, attempt, route, err, errCode, log)
return
}

Expand Down Expand Up @@ -169,20 +174,7 @@ func (rh *routeHandler) connectEgress(msg *channel.Message, attempt int, ch chan
if peerData, err := dialer.Dial(params); err == nil {
rh.completeRoute(msg, attempt, route, peerData, log)
} else {
var errCode byte

switch {
case errors.Is(err, syscall.ECONNREFUSED):
errCode = ctrl_msg.ErrorTypeConnectionRefused
case isNetworkTimeout(err) || errors.Is(err, syscall.ETIMEDOUT):
errCode = ctrl_msg.ErrorTypeDialTimedOut
case errors.As(err, &xgress.MisconfiguredTerminatorError{}):
errCode = ctrl_msg.ErrorTypeMisconfiguredTerminator
case errors.As(err, &xgress.InvalidTerminatorError{}):
errCode = ctrl_msg.ErrorTypeInvalidTerminator
default:
errCode = ctrl_msg.ErrorTypeGeneric
}
errCode := classifyDialError(err)

rh.fail(msg, attempt, route, errors.Wrapf(err, "error creating route for [c/%s]", route.CircuitId), errCode, log)
}
Expand All @@ -196,6 +188,53 @@ func (rh *routeHandler) connectEgress(msg *channel.Message, attempt int, ch chan
}
}

// classifyDialError maps a dial error to a specific error type code for circuit failure reporting.
func classifyDialError(err error) byte {
switch {
case errors.Is(err, syscall.ECONNREFUSED):
return ctrl_msg.ErrorTypeConnectionRefused
case isResourcesNotAvailable(err):
return ctrl_msg.ErrorTypeResourcesNotAvailable
case isDnsError(err):
return ctrl_msg.ErrorTypeDnsResolutionFailed
case isNetworkTimeout(err) || errors.Is(err, syscall.ETIMEDOUT):
return ctrl_msg.ErrorTypeDialTimedOut
case errors.As(err, &xgress.MisconfiguredTerminatorError{}):
return ctrl_msg.ErrorTypeMisconfiguredTerminator
case errors.As(err, &xgress.InvalidTerminatorError{}):
return ctrl_msg.ErrorTypeInvalidTerminator
case isPortNotAllowedError(err):
return ctrl_msg.ErrorTypePortNotAllowed
case isRejectedByApplicationError(err):
return ctrl_msg.ErrorTypeRejectedByApplication
default:
return ctrl_msg.ErrorTypeGeneric
}
}

func isResourcesNotAvailable(err error) bool {
return errors.Is(err, syscall.EMFILE) || errors.Is(err, syscall.ENFILE) || errors.Is(err, syscall.ENOBUFS)
}

func isDnsError(err error) bool {
var dnsErr *net.DNSError
if errors.As(err, &dnsErr) {
return true
}
// ER/T hosted services send dial errors as strings via the SDK protocol,
// so the *net.DNSError type is lost. Fall back to string matching.
errMsg := err.Error()
return strings.Contains(errMsg, "no such host") || strings.Contains(errMsg, "server misbehaving")
}

func isPortNotAllowedError(err error) bool {
return strings.Contains(err.Error(), "not in allowed port ranges")
}

func isRejectedByApplicationError(err error) bool {
return strings.Contains(err.Error(), "rejected by application")
}

func isNetworkTimeout(err error) bool {
var netErr net.Error
return errors.As(err, &netErr)
Expand Down
Loading
Loading