Observabilidade é essencial para aplicações em produção. Este guia completo mostra como implementar logging estruturado, métricas, traces e alerting em Go usando as melhores práticas e ferramentas modernas.
Pilares da Observabilidade
┌─────────────────────────────────────────────────────────┐
│ OBSERVABILITY │
├─────────────┬─────────────┬─────────────────────────────┤
│ LOGS │ METRICS │ TRACES │
│ │ │ │
│ O que │ Quanto/ │ Onde/ │
│ aconteceu │ Quando │ Quando │
│ │ │ │
│ Eventos │ Números │ Request flow │
│ detalhados │ agregados │ distribuído │
└─────────────┴─────────────┴─────────────────────────────┘
1. Structured Logging
Logging estruturado é fundamental para debug e análise de incidentes.
slog (Biblioteca Padrão - Go 1.21+)
package main
import (
"log/slog"
"os"
"time"
)
// main demonstrates structured logging with log/slog: a JSON handler for
// production and a text handler for development.
func main() {
	// JSON logger for production: one machine-parseable object per line.
	logger := slog.New(slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{
		Level: slog.LevelInfo,
	}))

	// Text logger for development: human-readable, Debug level enabled.
	devLogger := slog.New(slog.NewTextHandler(os.Stdout, &slog.HandlerOptions{
		Level: slog.LevelDebug,
	}))
	// Actually use the dev logger — the original declared it and never used
	// it, which is a compile error in Go ("devLogger declared and not used").
	devLogger.Debug("development logger initialized")

	// Structured logging: each attribute is a typed key/value pair.
	logger.Info("request completed",
		slog.String("method", "GET"),
		slog.String("path", "/api/users"),
		slog.Int("status", 200),
		slog.Duration("duration", 150*time.Millisecond),
		slog.String("request_id", "abc-123"),
	)
}
Output JSON (Produção)
{
"time": "2026-02-11T14:30:45Z",
"level": "INFO",
"msg": "request completed",
"method": "GET",
"path": "/api/users",
"status": 200,
"duration": 150000000,
"request_id": "abc-123"
}
Logger Customizado com Contexto
type contextKey string
const loggerKey contextKey = "logger"
// Logger embeds *slog.Logger and carries service identity fields that are
// stamped onto every request-scoped logger derived from it.
type Logger struct {
	*slog.Logger
	service string
	version string
}

// NewLogger builds a JSON logger writing to stdout, tagged with the given
// service name and version.
func NewLogger(service, version string) *Logger {
	base := slog.New(slog.NewJSONHandler(os.Stdout, nil))
	return &Logger{Logger: base, service: service, version: version}
}

// WithRequestID derives a child logger that carries the service identity plus
// the per-request correlation ID.
func (l *Logger) WithRequestID(requestID string) *slog.Logger {
	attrs := []any{
		slog.String("service", l.service),
		slog.String("version", l.version),
		slog.String("request_id", requestID),
	}
	return l.With(attrs...)
}
// Middleware para injetar logger no contexto
// LoggerMiddleware injects a request-scoped logger (tagged with a freshly
// generated request ID) into the request context, and echoes the ID back to
// the client via the X-Request-ID header.
func LoggerMiddleware(logger *Logger) func(http.Handler) http.Handler {
	return func(next http.Handler) http.Handler {
		fn := func(w http.ResponseWriter, r *http.Request) {
			id := generateRequestID()
			reqLogger := logger.WithRequestID(id)
			w.Header().Set("X-Request-ID", id)
			ctx := context.WithValue(r.Context(), loggerKey, reqLogger)
			next.ServeHTTP(w, r.WithContext(ctx))
		}
		return http.HandlerFunc(fn)
	}
}
// LoggerFromContext returns the request-scoped logger stored by
// LoggerMiddleware, falling back to the process-wide default logger when the
// context carries none.
func LoggerFromContext(ctx context.Context) *slog.Logger {
	logger, ok := ctx.Value(loggerKey).(*slog.Logger)
	if !ok {
		return slog.Default()
	}
	return logger
}
Níveis de Log Apropriados
// DEBUG — detailed diagnostics for development; usually too noisy for production.
logger.Debug("database query",
slog.String("sql", "SELECT * FROM users"),
slog.Any("params", params),
)
// INFO — normal application events that form the operational audit trail.
logger.Info("user created",
slog.String("user_id", user.ID),
slog.String("email", user.Email),
)
// WARN — anomalous but recoverable situations worth investigating.
logger.Warn("slow query detected",
slog.Duration("duration", 5*time.Second),
slog.String("query", query),
)
// ERROR — failures that need attention (alerting typically keys off these).
logger.Error("payment processing failed",
slog.String("order_id", orderID),
slog.String("error", err.Error()),
slog.String("user_id", userID),
)
2. Métricas com Prometheus
Métricas permitem monitorar a saúde e performance da aplicação.
Tipos de Métricas
package metrics
import (
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
)
var (
// RequestsTotal — Counter: a monotonically increasing value (it only resets
// when the process restarts), labeled by method/path/status.
RequestsTotal = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "http_requests_total",
Help: "Total de requisições HTTP",
},
[]string{"method", "path", "status"},
)
// ActiveConnections — Gauge: a value that can go up and down.
ActiveConnections = promauto.NewGauge(
prometheus.GaugeOpts{
Name: "active_connections",
Help: "Conexões ativas no momento",
},
)
// RequestDuration — Histogram: latency distribution in cumulative buckets,
// aggregatable server-side with histogram_quantile().
RequestDuration = promauto.NewHistogramVec(
prometheus.HistogramOpts{
Name: "http_request_duration_seconds",
Help: "Duração das requisições HTTP",
Buckets: prometheus.DefBuckets,
},
[]string{"method", "path"},
)
// ResponseSize — Summary: like a histogram but with client-side quantiles
// (0.5/0.9/0.99 here); quantiles cannot be aggregated across instances.
ResponseSize = promauto.NewSummaryVec(
prometheus.SummaryOpts{
Name: "http_response_size_bytes",
Help: "Tamanho das respostas HTTP",
Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
},
[]string{"method", "path"},
)
)
Middleware de Métricas HTTP
package middleware
import (
"net/http"
"strconv"
"time"
"github.com/prometheus/client_golang/prometheus"
"myapp/metrics"
)
type responseRecorder struct {
http.ResponseWriter
statusCode int
size int
}
func (rr *responseRecorder) WriteHeader(code int) {
rr.statusCode = code
rr.ResponseWriter.WriteHeader(code)
}
func (rr *responseRecorder) Write(b []byte) (int, error) {
size, err := rr.ResponseWriter.Write(b)
rr.size += size
return size, err
}
// MetricsMiddleware records request count, latency, response size, and the
// number of in-flight requests for every request it wraps.
//
// NOTE(review): raw r.URL.Path is used as a label value; on APIs with dynamic
// path segments (/orders/123) this can explode label cardinality — consider
// labeling with the route pattern instead. Confirm against the router in use.
func MetricsMiddleware(next http.Handler) http.Handler {
	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		started := time.Now()
		metrics.ActiveConnections.Inc()
		defer metrics.ActiveConnections.Dec()

		// Default to 200: WriteHeader is never called for implicit-OK responses.
		rec := &responseRecorder{ResponseWriter: w, statusCode: http.StatusOK}
		next.ServeHTTP(rec, r)
		elapsed := time.Since(started)

		method, path := r.Method, r.URL.Path
		metrics.RequestsTotal.WithLabelValues(method, path, strconv.Itoa(rec.statusCode)).Inc()
		metrics.RequestDuration.WithLabelValues(method, path).Observe(elapsed.Seconds())
		metrics.ResponseSize.WithLabelValues(method, path).Observe(float64(rec.size))
	})
}
Métricas de Negócio
var (
// OrdersCreated counts created orders, labeled by country and payment method.
OrdersCreated = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "orders_created_total",
Help: "Total de pedidos criados",
},
[]string{"country", "payment_method"},
)
// OrderValue tracks the distribution of order values in USD per country;
// bucket bounds chosen for typical ticket sizes.
OrderValue = promauto.NewHistogramVec(
prometheus.HistogramOpts{
Name: "order_value_usd",
Help: "Valor dos pedidos em USD",
Buckets: []float64{10, 50, 100, 500, 1000, 5000},
},
[]string{"country"},
)
// UserRegistrations counts sign-ups, labeled by acquisition source.
UserRegistrations = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "user_registrations_total",
Help: "Total de registros de usuários",
},
[]string{"source"},
)
// PaymentFailures counts failed payments by gateway and error type.
PaymentFailures = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "payment_failures_total",
Help: "Total de falhas de pagamento",
},
[]string{"gateway", "error_type"},
)
)
// Usage in handlers: record business metrics once domain processing succeeded.
func CreateOrderHandler(w http.ResponseWriter, r *http.Request) {
// ... order processing (validation, persistence) elided ...
metrics.OrdersCreated.WithLabelValues(order.Country, order.PaymentMethod).Inc()
metrics.OrderValue.WithLabelValues(order.Country).Observe(order.Total)
// ... write the HTTP response ...
}
Endpoint Prometheus
package main
import (
"net/http"
"github.com/prometheus/client_golang/prometheus/promhttp"
)
// main wires the Prometheus scrape endpoint and the instrumented API routes.
func main() {
	// Endpoint scraped by Prometheus.
	http.Handle("/metrics", promhttp.Handler())

	// Application routes, wrapped with the metrics middleware.
	http.Handle("/api/", MetricsMiddleware(apiHandler))

	// ListenAndServe only returns on failure; the original discarded that
	// error, silently turning e.g. a port-bind failure into a no-op program.
	if err := http.ListenAndServe(":8080", nil); err != nil {
		panic(err)
	}
}
Configuração Prometheus (prometheus.yml)
global:
scrape_interval: 15s
evaluation_interval: 15s
scrape_configs:
- job_name: 'go-app'
static_configs:
- targets: ['localhost:8080']
metrics_path: /metrics
scrape_interval: 10s
- job_name: 'go-app-staging'
static_configs:
- targets: ['staging.example.com:8080']
metrics_path: /metrics
3. Distributed Tracing
Traces mostram o fluxo completo de uma requisição através de múltiplos serviços.
OpenTelemetry Setup
package telemetry
import (
"context"
"time"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc"
"go.opentelemetry.io/otel/sdk/resource"
sdktrace "go.opentelemetry.io/otel/sdk/trace"
semconv "go.opentelemetry.io/otel/semconv/v1.21.0"
"go.opentelemetry.io/otel/trace"
)
var Tracer trace.Tracer
// InitTracer configures the global OpenTelemetry tracer provider: an OTLP
// gRPC exporter, a resource describing this service, and ratio-based sampling.
// The returned provider must be Shutdown by the caller to flush spans.
func InitTracer(serviceName, serviceVersion string) (*sdktrace.TracerProvider, error) {
ctx := context.Background()
// OTLP gRPC exporter (sends spans to Jaeger, Zipkin, or an OTel collector).
exporter, err := otlptracegrpc.New(ctx,
otlptracegrpc.WithEndpoint("localhost:4317"),
otlptracegrpc.WithInsecure(),
)
if err != nil {
return nil, err
}
// Resource: service identity attached to every exported span.
res, err := resource.New(ctx,
resource.WithAttributes(
semconv.ServiceName(serviceName),
semconv.ServiceVersion(serviceVersion),
semconv.DeploymentEnvironment("production"),
),
)
if err != nil {
return nil, err
}
// Provider batches spans and samples 10% of traces by trace ID.
// NOTE(review): consider sdktrace.ParentBased(sdktrace.TraceIDRatioBased(0.1))
// so children of a sampled parent stay sampled across services — confirm intent.
tp := sdktrace.NewTracerProvider(
sdktrace.WithBatcher(exporter),
sdktrace.WithResource(res),
sdktrace.WithSampler(sdktrace.TraceIDRatioBased(0.1)), // 10% sampling
)
otel.SetTracerProvider(tp)
// NOTE(review): no otel.SetTextMapPropagator is registered here; the default
// propagator is a no-op, so trace context will not propagate between services
// unless it is set elsewhere — confirm.
Tracer = tp.Tracer(serviceName)
return tp, nil
}
Instrumentação de Handlers
package middleware
import (
"net/http"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/trace"
"myapp/telemetry"
)
// TracingMiddleware starts a server-side span for every request and places
// the span-bearing context on the request for downstream handlers.
//
// NOTE(review): incoming trace headers (traceparent) are never extracted, so
// each request starts a fresh trace instead of joining the caller's — to join
// distributed traces, Extract the propagated context before Start (or use
// otelhttp.NewHandler). Confirm whether standalone traces are intended.
func TracingMiddleware(next http.Handler) http.Handler {
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
ctx, span := telemetry.Tracer.Start(r.Context(), "HTTP "+r.Method+" "+r.URL.Path,
trace.WithAttributes(
attribute.String("http.method", r.Method),
attribute.String("http.url", r.URL.String()),
attribute.String("http.target", r.URL.Path),
attribute.String("http.host", r.Host),
attribute.String("http.user_agent", r.UserAgent()),
),
)
defer span.End()
// Hand the span-bearing context to the rest of the chain.
next.ServeHTTP(w, r.WithContext(ctx))
})
}
Instrumentação de Database
package database
import (
"context"
"database/sql"
"time"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/trace"
"myapp/telemetry"
)
// TracedDB decorates *sql.DB so that queries are recorded as spans.
type TracedDB struct {
	*sql.DB
}

// QueryContext runs the query inside a "database.query" span, annotating it
// with the SQL statement, the elapsed time in milliseconds, and any error.
func (db *TracedDB) QueryContext(ctx context.Context, query string, args ...interface{}) (*sql.Rows, error) {
	ctx, span := telemetry.Tracer.Start(ctx, "database.query",
		trace.WithAttributes(
			attribute.String("db.system", "postgresql"),
			attribute.String("db.statement", query),
		),
	)
	defer span.End()

	began := time.Now()
	rows, err := db.DB.QueryContext(ctx, query, args...)
	elapsed := time.Since(began)

	span.SetAttributes(attribute.Int64("db.duration_ms", elapsed.Milliseconds()))
	if err != nil {
		span.RecordError(err)
	}
	return rows, err
}
// Método similar para ExecContext, PrepareContext, etc.
Instrumentação de Chamadas HTTP
package client
import (
"context"
"net/http"
"time"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/propagation"
"go.opentelemetry.io/otel/trace"
"myapp/telemetry"
)
// TracedClient wraps an http.Client so every outbound request is recorded as
// a client span, with W3C trace context injected into the request headers.
type TracedClient struct {
client http.Client
propagator propagation.TraceContext
}
// Do executes req inside a client span, propagates the trace context to the
// downstream service via the request headers, and records the status code and
// latency on the span. On error the span records the error and nil is returned.
func (c *TracedClient) Do(ctx context.Context, req *http.Request) (*http.Response, error) {
	ctx, span := telemetry.Tracer.Start(ctx, "HTTP Client "+req.Method,
		trace.WithAttributes(
			attribute.String("http.method", req.Method),
			attribute.String("http.url", req.URL.String()),
			attribute.String("peer.service", req.Host),
		),
	)
	defer span.End()

	// BUG FIX: attach the span's context to the outgoing request. The original
	// injected headers derived from ctx but sent the request with its original
	// context, so deadlines/cancellation carried by ctx were never honored by
	// the transport.
	req = req.WithContext(ctx)

	// Propagate the trace context (traceparent header) downstream.
	c.propagator.Inject(ctx, propagation.HeaderCarrier(req.Header))

	start := time.Now()
	resp, err := c.client.Do(req)
	duration := time.Since(start)

	if err != nil {
		span.RecordError(err)
		return nil, err
	}
	span.SetAttributes(
		attribute.Int("http.status_code", resp.StatusCode),
		attribute.Int64("http.duration_ms", duration.Milliseconds()),
	)
	return resp, nil
}
Spans Personalizados
// ProcessPayment validates the payment and charges it through the gateway,
// recording the overall operation and each phase as spans. Returns the first
// error encountered; the parent span records every failure.
func ProcessPayment(ctx context.Context, payment Payment) error {
	ctx, span := telemetry.Tracer.Start(ctx, "process-payment",
		trace.WithAttributes(
			attribute.String("payment.id", payment.ID),
			attribute.String("payment.method", payment.Method),
			attribute.Float64("payment.amount", payment.Amount),
		),
	)
	defer span.End()

	// Validation phase. BUG FIX: the validate span's context is deliberately
	// NOT assigned back to ctx — the original reassigned ctx here, which made
	// the later gateway span a child of the already-ended validate span
	// instead of a sibling under process-payment.
	_, validateSpan := telemetry.Tracer.Start(ctx, "validate-payment")
	if err := validatePayment(payment); err != nil {
		validateSpan.RecordError(err)
		validateSpan.End()
		span.RecordError(err)
		return err
	}
	validateSpan.End()

	// Gateway charge phase.
	gatewayCtx, gatewaySpan := telemetry.Tracer.Start(ctx, "gateway-charge",
		trace.WithAttributes(
			attribute.String("gateway", "stripe"),
		),
	)
	result, err := gateway.Charge(gatewayCtx, payment)
	if err != nil {
		// Record the failure on the phase span too, so "gateway-charge" itself
		// shows as errored in the trace UI (the original only marked the parent).
		gatewaySpan.RecordError(err)
		gatewaySpan.End()
		span.RecordError(err)
		span.SetAttributes(attribute.Bool("payment.success", false))
		return err
	}
	gatewaySpan.End()

	span.SetAttributes(
		attribute.Bool("payment.success", true),
		attribute.String("payment.transaction_id", result.TransactionID),
	)
	return nil
}
4. OpenTelemetry Collector
Para produção, use o Collector para receber, processar e exportar telemetria.
Configuração (otel-collector.yml)
receivers:
otlp:
protocols:
grpc:
endpoint: 0.0.0.0:4317
http:
endpoint: 0.0.0.0:4318
processors:
batch:
timeout: 1s
send_batch_size: 1024
resource:
attributes:
- key: environment
value: production
action: upsert
exporters:
prometheus:
endpoint: "0.0.0.0:8889"
jaeger:
endpoint: jaeger:14250
tls:
insecure: true
logging:
loglevel: debug
service:
pipelines:
traces:
receivers: [otlp]
processors: [batch, resource]
exporters: [jaeger, logging]
metrics:
receivers: [otlp]
processors: [batch]
exporters: [prometheus, logging]
5. Alerting
Configure alertas baseados em métricas para detectar problemas rapidamente.
Regras de Alerta Prometheus
# alert-rules.yml
groups:
- name: go-app-alerts
rules:
- alert: HighErrorRate
expr: |
(
sum(rate(http_requests_total{status=~"5.."}[5m]))
/
sum(rate(http_requests_total[5m]))
) > 0.05
for: 2m
labels:
severity: critical
annotations:
summary: "Taxa de erro alta detectada"
description: "Taxa de erro {{ $value | humanizePercentage }} nos últimos 5 minutos"
- alert: HighLatency
expr: |
histogram_quantile(0.95,
sum(rate(http_request_duration_seconds_bucket[5m])) by (le)
) > 0.5
for: 3m
labels:
severity: warning
annotations:
summary: "Latência P95 acima de 500ms"
description: "Latência P95 está em {{ $value }}s"
- alert: ServiceDown
expr: up{job="go-app"} == 0
for: 1m
labels:
severity: critical
annotations:
summary: "Serviço {{ $labels.instance }} está fora do ar"
- alert: HighMemoryUsage
expr: |
(process_resident_memory_bytes /
(node_memory_MemTotal_bytes or node_memory_MemTotal))
> 0.9
for: 5m
labels:
severity: warning
annotations:
summary: "Uso de memória alto"
description: "Uso de memória acima de 90%"
AlertManager Configuração
# alertmanager.yml
global:
smtp_smarthost: 'smtp.gmail.com:587'
smtp_from: 'alerts@example.com'
smtp_auth_username: 'alerts@example.com'
smtp_auth_password: '${SMTP_PASSWORD}'
route:
group_by: ['alertname', 'severity']
group_wait: 10s
group_interval: 10s
repeat_interval: 1h
receiver: 'default'
routes:
- match:
severity: critical
receiver: 'pagerduty'
continue: true
- match:
severity: warning
receiver: 'slack'
receivers:
- name: 'default'
email_configs:
- to: 'team@example.com'
subject: 'Alerta: {{ .GroupLabels.alertname }}'
- name: 'slack'
slack_configs:
- api_url: '${SLACK_WEBHOOK_URL}'
channel: '#alerts'
title: 'Alerta: {{ .GroupLabels.alertname }}'
text: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'
- name: 'pagerduty'
pagerduty_configs:
- service_key: '${PAGERDUTY_KEY}'
severity: critical
Health Checks
package health
import (
	"context"
	"encoding/json"
	"net/http"
	"time"
)
type HealthChecker struct {
checks map[string]CheckFunc
}
type CheckFunc func(ctx context.Context) error
func (h *HealthChecker) Register(name string, check CheckFunc) {
h.checks[name] = check
}
// Handler runs every registered check with a shared 5-second budget and
// writes a JSON map of check name -> "ok" / "fail: <reason>". Any failing
// check downgrades the HTTP status to 503 Service Unavailable.
func (h *HealthChecker) Handler(w http.ResponseWriter, r *http.Request) {
	ctx, cancel := context.WithTimeout(r.Context(), 5*time.Second)
	defer cancel()

	report := make(map[string]string)
	code := http.StatusOK
	for name, probe := range h.checks {
		err := probe(ctx)
		if err == nil {
			report[name] = "ok"
			continue
		}
		report[name] = "fail: " + err.Error()
		code = http.StatusServiceUnavailable
	}

	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(code)
	json.NewEncoder(w).Encode(report)
}
// Usage: register one probe per dependency and expose the handler.
health := &HealthChecker{checks: make(map[string]CheckFunc)}
health.Register("database", func(ctx context.Context) error {
return db.PingContext(ctx)
})
health.Register("cache", func(ctx context.Context) error {
return redis.Ping(ctx).Err()
})
http.Handle("/health", http.HandlerFunc(health.Handler))
6. Exemplo Real: API Completa
package main
import (
"context"
"log"
"log/slog"
"net/http"
"os"
"os/signal"
"syscall"
"time"
"github.com/prometheus/client_golang/prometheus/promhttp"
"myapp/middleware"
"myapp/telemetry"
)
// main wires logging, tracing, metrics, and graceful shutdown for the
// order-service HTTP API.
func main() {
	// Structured JSON logger as the process-wide default.
	logger := slog.New(slog.NewJSONHandler(os.Stdout, nil))
	slog.SetDefault(logger)

	// Tracing: OTLP exporter + tracer provider.
	tp, err := telemetry.InitTracer("order-service", "1.0.0")
	if err != nil {
		log.Fatal(err)
	}
	// Flush buffered spans on exit; the original discarded Shutdown's error.
	defer func() {
		if err := tp.Shutdown(context.Background()); err != nil {
			logger.Error("tracer shutdown", slog.String("error", err.Error()))
		}
	}()

	// Router with the observability middleware chain
	// (outermost first: tracing -> metrics -> request logger).
	mux := http.NewServeMux()
	handler := middleware.TracingMiddleware(
		middleware.MetricsMiddleware(
			middleware.LoggerMiddleware(
				http.HandlerFunc(orderHandler),
			),
		),
	)
	mux.Handle("/api/orders", handler)
	mux.Handle("/metrics", promhttp.Handler())
	mux.Handle("/health", http.HandlerFunc(healthHandler))

	server := &http.Server{
		Addr:         ":8080",
		Handler:      mux,
		ReadTimeout:  5 * time.Second,
		WriteTimeout: 10 * time.Second,
	}

	// Graceful shutdown on SIGINT/SIGTERM.
	go func() {
		sigChan := make(chan os.Signal, 1)
		signal.Notify(sigChan, os.Interrupt, syscall.SIGTERM)
		<-sigChan

		shutdownCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
		defer cancel()
		// The original ignored Shutdown's error; surface it so a hung
		// connection drain is visible in the logs.
		if err := server.Shutdown(shutdownCtx); err != nil {
			logger.Error("graceful shutdown failed", slog.String("error", err.Error()))
		}
	}()

	logger.Info("server starting", slog.String("addr", server.Addr))
	if err := server.ListenAndServe(); err != http.ErrServerClosed {
		logger.Error("server error", slog.String("error", err.Error()))
	}
}
// orderHandler decodes an order request, processes it, records business
// metrics, and writes the created order as JSON.
func orderHandler(w http.ResponseWriter, r *http.Request) {
	ctx := r.Context()
	logger := slog.Default()

	var req OrderRequest
	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
		http.Error(w, err.Error(), http.StatusBadRequest)
		return
	}

	logger.InfoContext(ctx, "creating order",
		slog.String("user_id", req.UserID),
		slog.Float64("amount", req.Amount),
	)

	// Process the order (trace context travels through ctx).
	order, err := processOrder(ctx, req)
	if err != nil {
		logger.ErrorContext(ctx, "order failed", slog.String("error", err.Error()))
		// SECURITY FIX: log the real error but return a generic message —
		// internal details (SQL, gateway responses, ...) must not leak to clients.
		http.Error(w, "internal server error", http.StatusInternalServerError)
		return
	}

	// Business metrics (country hard-coded to BR in this example).
	metrics.OrdersCreated.WithLabelValues("BR", req.PaymentMethod).Inc()
	metrics.OrderValue.WithLabelValues("BR").Observe(req.Amount)

	w.Header().Set("Content-Type", "application/json")
	// Headers are already sent at this point, so an encode failure can only
	// be logged, not reported to the client.
	if err := json.NewEncoder(w).Encode(order); err != nil {
		logger.ErrorContext(ctx, "encode response", slog.String("error", err.Error()))
	}
}
Dashboards Grafana
Queries Úteis para Prometheus
# Taxa de requisições por segundo
sum(rate(http_requests_total[5m]))
# Taxa de erro por endpoint (fração 0–1; multiplique por 100 para obter %)
sum(rate(http_requests_total{status=~"5.."}[5m])) by (path)
/
sum(rate(http_requests_total[5m])) by (path)
# Latência P95
histogram_quantile(0.95,
sum(rate(http_request_duration_seconds_bucket[5m])) by (le)
)
# Apdex score (le="0.3" = satisfeito, le="1.2" = tolerado):
# (satisfeitos + tolerados) / 2, dividido pelo total
(
  sum(rate(http_request_duration_seconds_bucket{le="0.3"}[5m])) +
  sum(rate(http_request_duration_seconds_bucket{le="1.2"}[5m]))
) / 2
/
sum(rate(http_request_duration_seconds_count[5m]))
Próximos Passos
- Go Performance Profiling — Otimize baseado em métricas
- Go e Docker — Containerize com health checks
- OpenTelemetry Docs — Instrumentação avançada
Observabilidade completa: logs para debug, métricas para dashboards, traces para entender fluxos complexos. Implemente os três pilares!