Observabilidade é essencial para aplicações em produção. Este guia completo mostra como implementar logging estruturado, métricas, traces e alerting em Go usando as melhores práticas e ferramentas modernas.

Pilares da Observabilidade

┌─────────────────────────────────────────────────────────┐
│                    OBSERVABILITY                        │
├─────────────┬─────────────┬─────────────────────────────┤
│    LOGS     │  METRICS    │          TRACES             │
│             │             │                             │
│ O que       │ Quanto/     │ Onde/                       │
│ aconteceu   │ Quando      │ Quando                      │
│             │             │                             │
│ Eventos     │ Números     │ Request flow                │
│ detalhados  │ agregados   │ distribuído                 │
└─────────────┴─────────────┴─────────────────────────────┘

1. Structured Logging

Logging estruturado é fundamental para debug e análise de incidentes.

slog (Biblioteca Padrão - Go 1.21+)

package main

import (
    "log/slog"
    "os"
    "time"
)

func main() {
    // JSON logger for production: machine-parseable, INFO and above.
    logger := slog.New(slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{
        Level: slog.LevelInfo,
    }))

    // Text logger for development: human-readable, DEBUG and above.
    devLogger := slog.New(slog.NewTextHandler(os.Stdout, &slog.HandlerOptions{
        Level: slog.LevelDebug,
    }))
    // Actually use the dev logger — the original declared it without using it,
    // which is a compile error in Go ("declared and not used").
    devLogger.Debug("dev logger initialized")

    // Structured logging: each key/value pair becomes a JSON field.
    logger.Info("request completed",
        slog.String("method", "GET"),
        slog.String("path", "/api/users"),
        slog.Int("status", 200),
        slog.Duration("duration", 150*time.Millisecond),
        slog.String("request_id", "abc-123"),
    )
}

Output JSON (Produção)

{
  "time": "2026-02-11T14:30:45Z",
  "level": "INFO",
  "msg": "request completed",
  "method": "GET",
  "path": "/api/users",
  "status": 200,
  "duration": 150000000,
  "request_id": "abc-123"
}

Logger Customizado com Contexto

// contextKey is an unexported type for context values, preventing key
// collisions with other packages.
type contextKey string

const loggerKey contextKey = "logger"

// Logger wraps slog.Logger with fixed service metadata that is stamped
// onto every request-scoped logger derived from it.
type Logger struct {
    *slog.Logger
    service string
    version string
}

// NewLogger builds a JSON logger tagged with the service name and version.
func NewLogger(service, version string) *Logger {
    handler := slog.NewJSONHandler(os.Stdout, nil)
    return &Logger{
        Logger:  slog.New(handler),
        service: service,
        version: version,
    }
}

// WithRequestID derives a request-scoped logger that carries the service
// metadata plus the given request ID on every record.
func (l *Logger) WithRequestID(requestID string) *slog.Logger {
    attrs := []any{
        slog.String("service", l.service),
        slog.String("version", l.version),
        slog.String("request_id", requestID),
    }
    return l.Logger.With(attrs...)
}

// Middleware para injetar logger no contexto
func LoggerMiddleware(logger *Logger) func(http.Handler) http.Handler {
    return func(next http.Handler) http.Handler {
        return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
            requestID := generateRequestID()
            
            ctx := context.WithValue(r.Context(), loggerKey, 
                logger.WithRequestID(requestID))
            
            w.Header().Set("X-Request-ID", requestID)
            next.ServeHTTP(w, r.WithContext(ctx))
        })
    }
}

// Recuperar logger do contexto
func LoggerFromContext(ctx context.Context) *slog.Logger {
    if logger, ok := ctx.Value(loggerKey).(*slog.Logger); ok {
        return logger
    }
    return slog.Default()
}

Níveis de Log Apropriados

// DEBUG - detailed information for development and troubleshooting.
logger.Debug("database query",
    slog.String("sql", "SELECT * FROM users"),
    slog.Any("params", params),
)

// INFO - normal application events worth recording.
logger.Info("user created",
    slog.String("user_id", user.ID),
    slog.String("email", user.Email),
)

// WARN - anomalous but recoverable situations.
logger.Warn("slow query detected",
    slog.Duration("duration", 5*time.Second),
    slog.String("query", query),
)

// ERROR - failures that need attention.
logger.Error("payment processing failed",
    slog.String("order_id", orderID),
    slog.String("error", err.Error()),
    slog.String("user_id", userID),
)

2. Métricas com Prometheus

Métricas permitem monitorar a saúde e performance da aplicação.

Tipos de Métricas

package metrics

import (
    "github.com/prometheus/client_golang/prometheus"
    "github.com/prometheus/client_golang/prometheus/promauto"
)

var (
    // Counter: monotonically increasing value (only resets on restart).
    // RequestsTotal counts HTTP requests by method, path, and status code.
    RequestsTotal = promauto.NewCounterVec(
        prometheus.CounterOpts{
            Name: "http_requests_total",
            Help: "Total de requisições HTTP",
        },
        []string{"method", "path", "status"},
    )
    
    // Gauge: value that can go up and down.
    // ActiveConnections tracks the number of in-flight requests.
    ActiveConnections = promauto.NewGauge(
        prometheus.GaugeOpts{
            Name: "active_connections",
            Help: "Conexões ativas no momento",
        },
    )
    
    // Histogram: distribution of observed values (latency here), bucketed.
    RequestDuration = promauto.NewHistogramVec(
        prometheus.HistogramOpts{
            Name:    "http_request_duration_seconds",
            Help:    "Duração das requisições HTTP",
            Buckets: prometheus.DefBuckets,
        },
        []string{"method", "path"},
    )
    
    // Summary: like a histogram but with client-side configurable quantiles.
    ResponseSize = promauto.NewSummaryVec(
        prometheus.SummaryOpts{
            Name:       "http_response_size_bytes",
            Help:       "Tamanho das respostas HTTP",
            Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
        },
        []string{"method", "path"},
    )
)

Middleware de Métricas HTTP

package middleware

import (
    "net/http"
    "strconv"
    "time"
    
    "github.com/prometheus/client_golang/prometheus"
    "myapp/metrics"
)

type responseRecorder struct {
    http.ResponseWriter
    statusCode int
    size       int
}

func (rr *responseRecorder) WriteHeader(code int) {
    rr.statusCode = code
    rr.ResponseWriter.WriteHeader(code)
}

func (rr *responseRecorder) Write(b []byte) (int, error) {
    size, err := rr.ResponseWriter.Write(b)
    rr.size += size
    return size, err
}

// MetricsMiddleware instruments every request with the Prometheus metrics
// above: in-flight connections, request count, latency, and response size.
//
// NOTE(review): r.URL.Path is used as a label value; with user-controlled
// paths this can explode label cardinality — prefer the matched route
// pattern when the router exposes one.
func MetricsMiddleware(next http.Handler) http.Handler {
    fn := func(w http.ResponseWriter, r *http.Request) {
        start := time.Now()

        metrics.ActiveConnections.Inc()
        defer metrics.ActiveConnections.Dec()

        recorder := &responseRecorder{
            ResponseWriter: w,
            statusCode:     http.StatusOK, // default when the handler never calls WriteHeader
        }

        next.ServeHTTP(recorder, r)

        elapsed := time.Since(start).Seconds()
        labels := []string{r.Method, r.URL.Path}

        metrics.RequestsTotal.WithLabelValues(r.Method, r.URL.Path, strconv.Itoa(recorder.statusCode)).Inc()
        metrics.RequestDuration.WithLabelValues(labels...).Observe(elapsed)
        metrics.ResponseSize.WithLabelValues(labels...).Observe(float64(recorder.size))
    }
    return http.HandlerFunc(fn)
}

Métricas de Negócio

var (
    // Orders.
    // OrdersCreated counts created orders by country and payment method.
    OrdersCreated = promauto.NewCounterVec(
        prometheus.CounterOpts{
            Name: "orders_created_total",
            Help: "Total de pedidos criados",
        },
        []string{"country", "payment_method"},
    )
    
    // OrderValue observes order totals (USD) into fixed buckets per country.
    OrderValue = promauto.NewHistogramVec(
        prometheus.HistogramOpts{
            Name:    "order_value_usd",
            Help:    "Valor dos pedidos em USD",
            Buckets: []float64{10, 50, 100, 500, 1000, 5000},
        },
        []string{"country"},
    )
    
    // Users.
    // UserRegistrations counts sign-ups by acquisition source.
    UserRegistrations = promauto.NewCounterVec(
        prometheus.CounterOpts{
            Name: "user_registrations_total",
            Help: "Total de registros de usuários",
        },
        []string{"source"},
    )
    
    // Payment.
    // PaymentFailures counts failed payments by gateway and error type.
    PaymentFailures = promauto.NewCounterVec(
        prometheus.CounterOpts{
            Name: "payment_failures_total",
            Help: "Total de falhas de pagamento",
        },
        []string{"gateway", "error_type"},
    )
)

// Usage inside handlers: record business metrics after processing succeeds.
// NOTE(review): `order` is assumed to come from the elided processing step —
// confirm its fields (Country, PaymentMethod, Total) against the real model.
func CreateOrderHandler(w http.ResponseWriter, r *http.Request) {
    // ... order processing (validation, persistence) elided ...
    
    metrics.OrdersCreated.WithLabelValues(order.Country, order.PaymentMethod).Inc()
    metrics.OrderValue.WithLabelValues(order.Country).Observe(order.Total)
    
    // ...
}

Endpoint Prometheus

package main

import (
    "net/http"
    
    "github.com/prometheus/client_golang/prometheus/promhttp"
)

func main() {
    // Expose the Prometheus scrape endpoint.
    http.Handle("/metrics", promhttp.Handler())

    // Application routes, instrumented with the metrics middleware.
    http.Handle("/api/", MetricsMiddleware(apiHandler))

    // ListenAndServe always returns a non-nil error; the original silently
    // discarded it, hiding startup failures such as a busy port.
    if err := http.ListenAndServe(":8080", nil); err != nil {
        panic(err)
    }
}

Configuração Prometheus (prometheus.yml)

global:
  scrape_interval: 15s
  evaluation_interval: 15s

scrape_configs:
  - job_name: 'go-app'
    static_configs:
      - targets: ['localhost:8080']
    metrics_path: /metrics
    scrape_interval: 10s
    
  - job_name: 'go-app-staging'
    static_configs:
      - targets: ['staging.example.com:8080']
    metrics_path: /metrics

3. Distributed Tracing

Traces mostram o fluxo completo de uma requisição através de múltiplos serviços.

OpenTelemetry Setup

package telemetry

import (
    "context"
    "time"
    
    "go.opentelemetry.io/otel"
    "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc"
    "go.opentelemetry.io/otel/sdk/resource"
    sdktrace "go.opentelemetry.io/otel/sdk/trace"
    semconv "go.opentelemetry.io/otel/semconv/v1.21.0"
    "go.opentelemetry.io/otel/trace"
)

// Tracer is the process-wide tracer; it is assigned by InitTracer.
var Tracer trace.Tracer

// InitTracer wires up the global OpenTelemetry tracer provider with an
// OTLP/gRPC exporter and 10% trace sampling. The caller owns the returned
// provider and should call Shutdown on it when the process exits.
func InitTracer(serviceName, serviceVersion string) (*sdktrace.TracerProvider, error) {
    ctx := context.Background()

    // OTLP exporter (ships spans to Jaeger, Zipkin, or an OTel collector).
    exp, err := otlptracegrpc.New(ctx,
        otlptracegrpc.WithEndpoint("localhost:4317"),
        otlptracegrpc.WithInsecure(),
    )
    if err != nil {
        return nil, err
    }

    // Service identity attached to every exported span.
    svcResource, err := resource.New(ctx,
        resource.WithAttributes(
            semconv.ServiceName(serviceName),
            semconv.ServiceVersion(serviceVersion),
            semconv.DeploymentEnvironment("production"),
        ),
    )
    if err != nil {
        return nil, err
    }

    provider := sdktrace.NewTracerProvider(
        sdktrace.WithBatcher(exp),
        sdktrace.WithResource(svcResource),
        sdktrace.WithSampler(sdktrace.TraceIDRatioBased(0.1)), // sample 10% of traces
    )

    otel.SetTracerProvider(provider)
    Tracer = provider.Tracer(serviceName)

    return provider, nil
}

Instrumentação de Handlers

package middleware

import (
    "net/http"
    
    "go.opentelemetry.io/otel/attribute"
    "go.opentelemetry.io/otel/trace"
    "myapp/telemetry"
)

func TracingMiddleware(next http.Handler) http.Handler {
    return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
        ctx, span := telemetry.Tracer.Start(r.Context(), "HTTP "+r.Method+" "+r.URL.Path,
            trace.WithAttributes(
                attribute.String("http.method", r.Method),
                attribute.String("http.url", r.URL.String()),
                attribute.String("http.target", r.URL.Path),
                attribute.String("http.host", r.Host),
                attribute.String("http.user_agent", r.UserAgent()),
            ),
        )
        defer span.End()
        
        // Passar contexto com trace para handlers
        next.ServeHTTP(w, r.WithContext(ctx))
    })
}

Instrumentação de Database

package database

import (
    "context"
    "database/sql"
    "time"
    
    "go.opentelemetry.io/otel/attribute"
    "go.opentelemetry.io/otel/trace"
    "myapp/telemetry"
)

// TracedDB wraps *sql.DB so that queries are recorded as spans.
type TracedDB struct {
    *sql.DB
}

// QueryContext runs the query under a "database.query" span, recording the
// statement, the duration in milliseconds, and any error.
// (interface{} modernized to any — the file already targets Go 1.21+.)
func (db *TracedDB) QueryContext(ctx context.Context, query string, args ...any) (*sql.Rows, error) {
    ctx, span := telemetry.Tracer.Start(ctx, "database.query",
        trace.WithAttributes(
            attribute.String("db.system", "postgresql"),
            // NOTE(review): recording the raw statement can leak sensitive
            // data into the tracing backend; consider sanitizing it.
            attribute.String("db.statement", query),
        ),
    )
    defer span.End()

    start := time.Now()
    rows, err := db.DB.QueryContext(ctx, query, args...)

    span.SetAttributes(
        attribute.Int64("db.duration_ms", time.Since(start).Milliseconds()),
    )
    if err != nil {
        span.RecordError(err)
    }

    return rows, err
}

// Método similar para ExecContext, PrepareContext, etc.

Instrumentação de Chamadas HTTP

package client

import (
    "context"
    "net/http"
    "time"
    
    "go.opentelemetry.io/otel/attribute"
    "go.opentelemetry.io/otel/propagation"
    "go.opentelemetry.io/otel/trace"
    "myapp/telemetry"
)

// TracedClient wraps an http.Client to create a client span per request and
// propagate the trace context to the downstream service.
type TracedClient struct {
    client     http.Client
    propagator propagation.TraceContext
}

// Do executes req under a client span. The span context is injected into the
// outgoing headers AND attached to the request itself, so cancellation and
// deadlines from ctx reach the transport.
func (c *TracedClient) Do(ctx context.Context, req *http.Request) (*http.Response, error) {
    ctx, span := telemetry.Tracer.Start(ctx, "HTTP Client "+req.Method,
        trace.WithAttributes(
            attribute.String("http.method", req.Method),
            attribute.String("http.url", req.URL.String()),
            attribute.String("peer.service", req.Host),
        ),
    )
    defer span.End()

    // Propagate the trace context to the downstream service via headers.
    c.propagator.Inject(ctx, propagation.HeaderCarrier(req.Header))

    // BUGFIX: the original sent req with its old context, so the traced
    // ctx's cancellation/deadline never applied to the actual HTTP call.
    req = req.WithContext(ctx)

    start := time.Now()
    resp, err := c.client.Do(req)
    duration := time.Since(start)

    if err != nil {
        span.RecordError(err)
        return nil, err
    }

    span.SetAttributes(
        attribute.Int("http.status_code", resp.StatusCode),
        attribute.Int64("http.duration_ms", duration.Milliseconds()),
    )

    return resp, nil
}

Spans Personalizados

// ProcessPayment validates the payment and charges it through the gateway,
// recording each stage as a child span of "process-payment".
func ProcessPayment(ctx context.Context, payment Payment) error {
    ctx, span := telemetry.Tracer.Start(ctx, "process-payment",
        trace.WithAttributes(
            attribute.String("payment.id", payment.ID),
            attribute.String("payment.method", payment.Method),
            attribute.Float64("payment.amount", payment.Amount),
        ),
    )
    defer span.End()

    // Validation stage.
    // BUGFIX: the original reassigned ctx here ("ctx, validateSpan := ..."),
    // which made the later "gateway-charge" span a child of
    // "validate-payment" instead of a sibling under "process-payment".
    _, validateSpan := telemetry.Tracer.Start(ctx, "validate-payment")
    if err := validatePayment(payment); err != nil {
        validateSpan.RecordError(err)
        validateSpan.End()
        return err
    }
    validateSpan.End()

    // Gateway charge stage; the gateway call runs under its own span.
    gatewayCtx, gatewaySpan := telemetry.Tracer.Start(ctx, "gateway-charge",
        trace.WithAttributes(
            attribute.String("gateway", "stripe"),
        ),
    )
    result, err := gateway.Charge(gatewayCtx, payment)
    if err != nil {
        // Record the failure on the gateway span too (the original only
        // recorded it on the parent, leaving the gateway span clean).
        gatewaySpan.RecordError(err)
        gatewaySpan.End()

        span.RecordError(err)
        span.SetAttributes(attribute.Bool("payment.success", false))
        return err
    }
    gatewaySpan.End()

    span.SetAttributes(
        attribute.Bool("payment.success", true),
        attribute.String("payment.transaction_id", result.TransactionID),
    )

    return nil
}

4. OpenTelemetry Collector

Para produção, use o Collector para receber, processar e exportar telemetria.

Configuração (otel-collector.yml)

receivers:
  otlp:
    protocols:
      grpc:
        endpoint: 0.0.0.0:4317
      http:
        endpoint: 0.0.0.0:4318

processors:
  batch:
    timeout: 1s
    send_batch_size: 1024
  
  resource:
    attributes:
      - key: environment
        value: production
        action: upsert

exporters:
  prometheus:
    endpoint: "0.0.0.0:8889"
  
  jaeger:
    endpoint: jaeger:14250
    tls:
      insecure: true
  
  logging:
    loglevel: debug

service:
  pipelines:
    traces:
      receivers: [otlp]
      processors: [batch, resource]
      exporters: [jaeger, logging]
    
    metrics:
      receivers: [otlp]
      processors: [batch]
      exporters: [prometheus, logging]

5. Alerting

Configure alertas baseados em métricas para detectar problemas rapidamente.

Regras de Alerta Prometheus

# alert-rules.yml
groups:
  - name: go-app-alerts
    rules:
      - alert: HighErrorRate
        expr: |
          (
            sum(rate(http_requests_total{status=~"5.."}[5m]))
            /
            sum(rate(http_requests_total[5m]))
          ) > 0.05
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Taxa de erro alta detectada"
          description: "Taxa de erro {{ $value | humanizePercentage }} nos últimos 5 minutos"
      
      - alert: HighLatency
        expr: |
          histogram_quantile(0.95, 
            sum(rate(http_request_duration_seconds_bucket[5m])) by (le)
          ) > 0.5
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: "Latência P95 acima de 500ms"
          description: "Latência P95 está em {{ $value }}s"
      
      - alert: ServiceDown
        expr: up{job="go-app"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Serviço {{ $labels.instance }} está fora do ar"
      
      - alert: HighMemoryUsage
        expr: |
          (process_resident_memory_bytes / 
           (node_memory_MemTotal_bytes or node_memory_MemTotal))
          > 0.9
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Uso de memória alto"
          description: "Uso de memória acima de 90%"

AlertManager Configuração

# alertmanager.yml
global:
  smtp_smarthost: 'smtp.gmail.com:587'
  smtp_from: 'alerts@example.com'
  smtp_auth_username: 'alerts@example.com'
  smtp_auth_password: '${SMTP_PASSWORD}'

route:
  group_by: ['alertname', 'severity']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'default'
  routes:
    - match:
        severity: critical
      receiver: 'pagerduty'
      continue: true
    - match:
        severity: warning
      receiver: 'slack'

receivers:
  - name: 'default'
    email_configs:
      - to: 'team@example.com'
        subject: 'Alerta: {{ .GroupLabels.alertname }}'
  
  - name: 'slack'
    slack_configs:
      - api_url: '${SLACK_WEBHOOK_URL}'
        channel: '#alerts'
        title: 'Alerta: {{ .GroupLabels.alertname }}'
        text: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'
  
  - name: 'pagerduty'
    pagerduty_configs:
      - service_key: '${PAGERDUTY_KEY}'
        severity: critical

Health Checks

package health

import (
    "context"
    "net/http"
    "time"
)

type HealthChecker struct {
    checks map[string]CheckFunc
}

type CheckFunc func(ctx context.Context) error

func (h *HealthChecker) Register(name string, check CheckFunc) {
    h.checks[name] = check
}

func (h *HealthChecker) Handler(w http.ResponseWriter, r *http.Request) {
    ctx, cancel := context.WithTimeout(r.Context(), 5*time.Second)
    defer cancel()
    
    results := make(map[string]string)
    status := http.StatusOK
    
    for name, check := range h.checks {
        if err := check(ctx); err != nil {
            results[name] = "fail: " + err.Error()
            status = http.StatusServiceUnavailable
        } else {
            results[name] = "ok"
        }
    }
    
    w.Header().Set("Content-Type", "application/json")
    w.WriteHeader(status)
    json.NewEncoder(w).Encode(results)
}

// Usage: register one check per external dependency, then expose /health
// for load balancers and orchestrators to probe.
health := &HealthChecker{checks: make(map[string]CheckFunc)}

health.Register("database", func(ctx context.Context) error {
    return db.PingContext(ctx)
})

health.Register("cache", func(ctx context.Context) error {
    return redis.Ping(ctx).Err()
})

http.Handle("/health", http.HandlerFunc(health.Handler))

6. Exemplo Real: API Completa

package main

import (
    "context"
    "encoding/json"
    "log"
    "log/slog"
    "net/http"
    "os"
    "os/signal"
    "syscall"
    "time"
    
    "github.com/prometheus/client_golang/prometheus/promhttp"
    "myapp/metrics"
    "myapp/middleware"
    "myapp/telemetry"
)

func main() {
    // Structured JSON logger as the process-wide default.
    logger := slog.New(slog.NewJSONHandler(os.Stdout, nil))
    slog.SetDefault(logger)

    // Distributed tracing; flush pending spans on exit.
    tp, err := telemetry.InitTracer("order-service", "1.0.0")
    if err != nil {
        log.Fatal(err)
    }
    defer tp.Shutdown(context.Background())

    // Router wrapped in the observability middleware chain
    // (outermost first: tracing → metrics → request-scoped logging).
    mux := http.NewServeMux()

    handler := middleware.TracingMiddleware(
        middleware.MetricsMiddleware(
            middleware.LoggerMiddleware(
                http.HandlerFunc(orderHandler),
            ),
        ),
    )

    mux.Handle("/api/orders", handler)
    mux.Handle("/metrics", promhttp.Handler())
    mux.Handle("/health", http.HandlerFunc(healthHandler))

    server := &http.Server{
        Addr:         ":8080",
        Handler:      mux,
        ReadTimeout:  5 * time.Second,
        WriteTimeout: 10 * time.Second,
    }

    // Graceful shutdown: on SIGINT/SIGTERM stop accepting connections and
    // wait (up to 30s) for in-flight requests to drain. The done channel
    // keeps main alive until Shutdown completes — in the original, main
    // could return while requests were still draining, cutting them off.
    done := make(chan struct{})
    go func() {
        defer close(done)

        sigChan := make(chan os.Signal, 1)
        signal.Notify(sigChan, os.Interrupt, syscall.SIGTERM)
        <-sigChan

        shutdownCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
        defer cancel()

        if err := server.Shutdown(shutdownCtx); err != nil {
            logger.Error("shutdown error", slog.String("error", err.Error()))
        }
    }()

    logger.Info("server starting", slog.String("addr", server.Addr))
    if err := server.ListenAndServe(); err == http.ErrServerClosed {
        // Shutdown was requested; wait for draining to finish.
        <-done
    } else {
        logger.Error("server error", slog.String("error", err.Error()))
    }
}

// orderHandler decodes an order request, processes it under the request's
// trace context, records business metrics, and writes the created order
// back as JSON.
func orderHandler(w http.ResponseWriter, r *http.Request) {
    var (
        ctx    = r.Context()
        logger = slog.Default()
        req    OrderRequest
    )

    if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
        http.Error(w, err.Error(), http.StatusBadRequest)
        return
    }

    logger.InfoContext(ctx, "creating order",
        slog.String("user_id", req.UserID),
        slog.Float64("amount", req.Amount),
    )

    // Process the order; spans created downstream link to the trace in ctx.
    order, err := processOrder(ctx, req)
    if err != nil {
        logger.ErrorContext(ctx, "order failed", slog.String("error", err.Error()))
        http.Error(w, err.Error(), http.StatusInternalServerError)
        return
    }

    // Business metrics.
    metrics.OrdersCreated.WithLabelValues("BR", req.PaymentMethod).Inc()
    metrics.OrderValue.WithLabelValues("BR").Observe(req.Amount)

    w.Header().Set("Content-Type", "application/json")
    json.NewEncoder(w).Encode(order)
}

Dashboards Grafana

Queries Úteis para Prometheus

# Taxa de requisições por segundo
sum(rate(http_requests_total[5m]))

# Taxa de erro (%) por endpoint
sum(rate(http_requests_total{status=~"5.."}[5m])) by (path)
/
sum(rate(http_requests_total[5m])) by (path)

# Latência P95
histogram_quantile(0.95, 
  sum(rate(http_request_duration_seconds_bucket[5m])) by (le)
)

# Apdex score — (satisfeitos + tolerados/2) / total. Como os buckets são
# cumulativos, (le="0.3" + le="1.2") / 2 equivale exatamente a isso.
(
  sum(rate(http_request_duration_seconds_bucket{le="0.3"}[5m])) +
  sum(rate(http_request_duration_seconds_bucket{le="1.2"}[5m]))
) / 2
/
sum(rate(http_request_duration_seconds_count[5m]))

Próximos Passos


Observabilidade completa: logs para debug, métricas para dashboards, traces para entender fluxos complexos. Implemente os três pilares!