From 085563ac2f92074cc20a50dedde2b834438f0475 Mon Sep 17 00:00:00 2001 From: Jeff Becker Date: Tue, 4 Sep 2018 08:41:25 -0400 Subject: [PATCH] implement utp link protocol gut curvecp for now --- CMakeLists.txt | 25 +- include/llarp/address_info.hpp | 15 + include/llarp/aligned.hpp | 16 +- include/llarp/link/server.hpp | 34 +- include/llarp/link/session.hpp | 8 +- include/llarp/link/utp.hpp | 15 + include/llarp/messages/link_intro.hpp | 6 +- include/llarp/net.hpp | 18 + include/utp.h | 227 ++ include/utp_types.h | 123 + libutp/LICENSE | 19 + libutp/Makefile | 48 + libutp/README.md | 68 + libutp/libutp_inet_ntop.cpp | 108 + libutp/libutp_inet_ntop.h | 68 + libutp/parse_log.py | 288 ++ libutp/utp_api.cpp | 139 + libutp/utp_callbacks.cpp | 208 + libutp/utp_callbacks.h | 47 + libutp/utp_hash.cpp | 246 ++ libutp/utp_hash.h | 146 + libutp/utp_internal.cpp | 3489 +++++++++++++++++ libutp/utp_internal.h | 141 + libutp/utp_packedsockaddr.cpp | 139 + libutp/utp_packedsockaddr.h | 60 + libutp/utp_templates.h | 195 + libutp/utp_utils.cpp | 254 ++ libutp/utp_utils.h | 27 + llarp/address_info.cpp | 15 + llarp/curvecp/client.cpp | 0 llarp/curvecp/impl.cpp | 32 - .../{curvecp/server.cpp => link/curvecp.cpp} | 1 + llarp/link/encoder.hpp | 3 +- llarp/link/server.cpp | 29 +- llarp/link/utp.cpp | 576 +++ llarp/link_intro.cpp | 9 +- llarp/net.cpp | 30 +- 37 files changed, 6782 insertions(+), 90 deletions(-) create mode 100644 include/llarp/link/utp.hpp create mode 100644 include/utp.h create mode 100644 include/utp_types.h create mode 100644 libutp/LICENSE create mode 100644 libutp/Makefile create mode 100644 libutp/README.md create mode 100644 libutp/libutp_inet_ntop.cpp create mode 100644 libutp/libutp_inet_ntop.h create mode 100644 libutp/parse_log.py create mode 100644 libutp/utp_api.cpp create mode 100644 libutp/utp_callbacks.cpp create mode 100644 libutp/utp_callbacks.h create mode 100644 libutp/utp_hash.cpp create mode 100644 libutp/utp_hash.h create mode 100644 
libutp/utp_internal.cpp create mode 100644 libutp/utp_internal.h create mode 100644 libutp/utp_packedsockaddr.cpp create mode 100644 libutp/utp_packedsockaddr.h create mode 100644 libutp/utp_templates.h create mode 100644 libutp/utp_utils.cpp create mode 100644 libutp/utp_utils.h delete mode 100644 llarp/curvecp/client.cpp delete mode 100644 llarp/curvecp/impl.cpp rename llarp/{curvecp/server.cpp => link/curvecp.cpp} (85%) create mode 100644 llarp/link/utp.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 7d8376763..c27f5e2b9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -155,7 +155,8 @@ set(TT_ROOT vendor/libtuntap-master) add_definitions(-D${CMAKE_SYSTEM_NAME}) if (UNIX) -add_definitions(-DUNIX) + add_definitions(-DUNIX) + add_definitions(-DPOSIX) endif() if(UNIX) @@ -305,7 +306,24 @@ set(NTRU_SRC crypto/libntrup/src/ntru.cpp ) -set(LIB_SRC + +set(UTP_SRC + libutp/utp_callbacks.cpp + libutp/utp_utils.cpp + libutp/utp_internal.cpp + libutp/utp_api.cpp + libutp/utp_packedsockaddr.cpp + libutp/utp_hash.cpp +) + + +if(WIN32) + set(UTP_SRC ${UTP_SRC} libutp/libutp_inet_ntop.cpp) +endif() + + +set(LIB_SRC + ${UTP_SRC} ${NTRU_SRC} llarp/address_info.cpp llarp/bencode.cpp @@ -338,7 +356,6 @@ set(LIB_SRC llarp/service.cpp llarp/transit_hop.cpp llarp/testnet.c - llarp/curvecp/impl.cpp llarp/dht/context.cpp llarp/dht/decode.cpp llarp/dht/dht_immediate.cpp @@ -348,8 +365,10 @@ set(LIB_SRC llarp/dht/got_router.cpp llarp/dht/publish_intro.cpp llarp/handlers/tun.cpp + llarp/link/curvecp.cpp llarp/link/encoder.cpp llarp/link/server.cpp + llarp/link/utp.cpp llarp/routing/dht_message.cpp llarp/routing/message_parser.cpp llarp/routing/path_confirm.cpp diff --git a/include/llarp/address_info.hpp b/include/llarp/address_info.hpp index 5294e0c5a..7aa7a3b50 100644 --- a/include/llarp/address_info.hpp +++ b/include/llarp/address_info.hpp @@ -30,6 +30,12 @@ namespace llarp AddressInfo& operator=(const AddressInfo& other); + bool + operator==(const AddressInfo& other) const; + 
+ bool + operator<(const AddressInfo& other) const; + bool BEncode(llarp_buffer_t* buf) const; @@ -43,6 +49,15 @@ namespace llarp inet_ntop(AF_INET6, &a.ip, tmp, sizeof(tmp)); return out << tmp << "." << std::to_string(a.port); } + + struct Hash + { + size_t + operator()(const AddressInfo& addr) const + { + return *addr.pubkey.data_l(); + } + }; }; } // namespace llarp diff --git a/include/llarp/aligned.hpp b/include/llarp/aligned.hpp index a1eede6fc..1741cb714 100644 --- a/include/llarp/aligned.hpp +++ b/include/llarp/aligned.hpp @@ -11,8 +11,8 @@ namespace llarp { - /// aligned buffer, aligns to the nears 8 bytes - template < size_t sz, bool randomize = false > + /// aligned buffer, aligns to the nears Long_t + template < size_t sz, bool randomize = false, typename Long_t = uint64_t > struct AlignedBuffer { AlignedBuffer() @@ -84,7 +84,7 @@ namespace llarp operator^(const AlignedBuffer& other) const { AlignedBuffer< sz > ret; - for(size_t idx = 0; idx < sz / 8; ++idx) + for(size_t idx = 0; idx < sz / sizeof(Long_t); ++idx) ret.l[idx] = l[idx] ^ other.l[idx]; return ret; } @@ -92,7 +92,7 @@ namespace llarp AlignedBuffer& operator^=(const AlignedBuffer& other) { - for(size_t idx = 0; idx < sz / 8; ++idx) + for(size_t idx = 0; idx < sz / sizeof(Long_t); ++idx) l[idx] ^= other.l[idx]; return *this; } @@ -146,13 +146,13 @@ namespace llarp return &b[0]; } - uint64_t* + Long_t* data_l() { return &l[0]; } - const uint64_t* + const Long_t* data_l() const { return &l[0]; @@ -194,7 +194,7 @@ namespace llarp struct Hash { size_t - operator()(const AlignedBuffer< sz >& buf) const + operator()(const AlignedBuffer& buf) const { return *buf.data_l(); } @@ -203,7 +203,7 @@ namespace llarp protected: union { byte_t b[sz]; - uint64_t l[(sz / 8) + (sz % 8)]; + Long_t l[(sz / sizeof(Long_t)) + (sz % sizeof(Long_t))]; }; }; diff --git a/include/llarp/link/server.hpp b/include/llarp/link/server.hpp index 43fe610e0..31ce2cbfd 100644 --- a/include/llarp/link/server.hpp +++ 
b/include/llarp/link/server.hpp @@ -42,19 +42,19 @@ namespace llarp uint16_t port); virtual ILinkSession* - NewInboundSession(const Addr& from) const = 0; + NewInboundSession(const Addr& from) = 0; virtual ILinkSession* - NewOutboundSession(const RouterContact& rc) const = 0; + NewOutboundSession(const RouterContact& rc, const AddressInfo& ai) = 0; - void + virtual void Pump(); - void - RecvFrom(const Addr& from, const void* buf, size_t sz); + virtual void + RecvFrom(const Addr& from, const void* buf, size_t sz) = 0; bool - PickAddress(const RouterContact& rc, llarp::Addr& picked) const; + PickAddress(const RouterContact& rc, AddressInfo& picked) const; void TryEstablishTo(const RouterContact& rc); @@ -78,17 +78,20 @@ namespace llarp SendTo(const PubKey& remote, llarp_buffer_t buf); bool - GetOurAddressInfo(llarp::AddressInfo& addr) const; + GetOurAddressInfo(AddressInfo& addr) const; virtual uint16_t Rank() const = 0; virtual bool - KeyGen(llarp::SecretKey&) = 0; + KeyGen(SecretKey&) = 0; const byte_t* TransportPubKey() const; + const byte_t* + TransportSecretKey() const; + bool EnsureKeys(const char* fpath); @@ -111,6 +114,21 @@ namespace llarp uint32_t tick_id; protected: + void + PutSession(const Addr& addr, ILinkSession* s) + { + util::Lock l(m_SessionsMutex); + m_Sessions.insert( + std::make_pair(addr, std::unique_ptr< ILinkSession >(s))); + } + + void + MapAddr(const Addr& addr, const PubKey& pk) + { + util::Lock l(m_LinksMutex); + m_Links.insert(std::make_pair(pk, addr)); + } + llarp_router* m_router; llarp_logic* m_Logic = nullptr; Addr m_ourAddr; diff --git a/include/llarp/link/session.hpp b/include/llarp/link/session.hpp index b6ccafeb0..62062c716 100644 --- a/include/llarp/link/session.hpp +++ b/include/llarp/link/session.hpp @@ -21,10 +21,6 @@ namespace llarp virtual void Tick(llarp_time_t now) = 0; - /// handle a link intro message - virtual bool - HandleLinkIntro(const LinkIntroMessage* msg) = 0; - /// send a message buffer to the remote endpoint 
virtual bool SendMessageBuffer(llarp_buffer_t buf) = 0; @@ -33,9 +29,9 @@ namespace llarp virtual bool Recv(const void* buf, size_t sz) = 0; - /// send handshake + /// start the connection virtual void - Handshake() = 0; + Start() = 0; /// send a keepalive to the remote endpoint virtual bool diff --git a/include/llarp/link/utp.hpp b/include/llarp/link/utp.hpp new file mode 100644 index 000000000..6824be0f4 --- /dev/null +++ b/include/llarp/link/utp.hpp @@ -0,0 +1,15 @@ +#ifndef LLARP_LINK_UTP_HPP +#define LLARP_LINK_UTP_HPP + +#include + +namespace llarp +{ + namespace utp + { + std::unique_ptr< ILinkLayer > + NewServer(llarp_router* r); + } +} // namespace llarp + +#endif diff --git a/include/llarp/messages/link_intro.hpp b/include/llarp/messages/link_intro.hpp index 1098618e2..427519201 100644 --- a/include/llarp/messages/link_intro.hpp +++ b/include/llarp/messages/link_intro.hpp @@ -8,16 +8,16 @@ namespace llarp struct LinkIntroMessage : public ILinkMessage { - LinkIntroMessage(ILinkSession* s) : ILinkMessage(), from(s) + LinkIntroMessage(ILinkSession* s) : ILinkMessage(s) { } ~LinkIntroMessage(); - ILinkSession* from; - RouterContact rc; + KeyExchangeNonce N; + bool DecodeKey(llarp_buffer_t key, llarp_buffer_t* buf); diff --git a/include/llarp/net.hpp b/include/llarp/net.hpp index 9860f601b..74a6a4bb4 100644 --- a/include/llarp/net.hpp +++ b/include/llarp/net.hpp @@ -9,12 +9,21 @@ bool operator==(const sockaddr& a, const sockaddr& b); +bool +operator==(const sockaddr_in& a, const sockaddr_in& b); + +bool +operator==(const sockaddr_in6& a, const sockaddr_in6& b); + bool operator<(const sockaddr_in6& a, const sockaddr_in6& b); bool operator<(const in6_addr& a, const in6_addr& b); +bool +operator==(const in6_addr& a, const in6_addr& b); + namespace llarp { struct Addr @@ -281,6 +290,15 @@ namespace llarp return !(*this == other); } + socklen_t + SockLen() const + { + if(af() == AF_INET) + return sizeof(sockaddr_in); + else + return sizeof(sockaddr_in6); + } + bool 
isPrivate() { diff --git a/include/utp.h b/include/utp.h new file mode 100644 index 000000000..402a92ec0 --- /dev/null +++ b/include/utp.h @@ -0,0 +1,227 @@ +/* + * Copyright (c) 2010-2013 BitTorrent, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#ifndef __UTP_H__ +#define __UTP_H__ + +#ifdef __cplusplus +extern "C" +{ +#endif + +#include +#include "utp_types.h" + + typedef struct UTPSocket utp_socket; + typedef struct struct_utp_context utp_context; + + enum + { + UTP_UDP_DONTFRAG = 2, // Used to be a #define as UDP_IP_DONTFRAG + }; + + enum + { + // socket has received syn-ack (notification only for outgoing connection + // completion) this implies writability + UTP_STATE_CONNECT = 1, + + // socket is able to send more data + UTP_STATE_WRITABLE = 2, + + // connection closed + UTP_STATE_EOF = 3, + + // socket is being destroyed, meaning all data has been sent if possible. 
+ // it is not valid to refer to the socket after this state change occurs + UTP_STATE_DESTROYING = 4, + }; + + extern const char *utp_state_names[]; + + // Error codes that can be passed to UTP_ON_ERROR callback + enum + { + UTP_ECONNREFUSED = 0, + UTP_ECONNRESET, + UTP_ETIMEDOUT, + }; + + extern const char *utp_error_code_names[]; + + enum + { + // callback names + UTP_ON_FIREWALL = 0, + UTP_ON_ACCEPT, + UTP_ON_CONNECT, + UTP_ON_ERROR, + UTP_ON_READ, + UTP_ON_OVERHEAD_STATISTICS, + UTP_ON_STATE_CHANGE, + UTP_GET_READ_BUFFER_SIZE, + UTP_ON_DELAY_SAMPLE, + UTP_GET_UDP_MTU, + UTP_GET_UDP_OVERHEAD, + UTP_GET_MILLISECONDS, + UTP_GET_MICROSECONDS, + UTP_GET_RANDOM, + UTP_LOG, + UTP_SENDTO, + + // context and socket options that may be set/queried + UTP_LOG_NORMAL, + UTP_LOG_MTU, + UTP_LOG_DEBUG, + UTP_SNDBUF, + UTP_RCVBUF, + UTP_TARGET_DELAY, + + UTP_ARRAY_SIZE, // must be last + }; + + extern const char *utp_callback_names[]; + + typedef struct + { + utp_context *context; + utp_socket *socket; + size_t len; + uint32 flags; + int callback_type; + const byte *buf; + + union { + const struct sockaddr *address; + int send; + int sample_ms; + int error_code; + int state; + }; + + union { + socklen_t address_len; + int type; + }; + } utp_callback_arguments; + + typedef uint64 + utp_callback_t(utp_callback_arguments *); + + // Returned by utp_get_context_stats() + typedef struct + { + uint32 _nraw_recv[5]; // total packets received less than 300/600/1200/MTU + // bytes for all connections (context-wide) + uint32 _nraw_send[5]; // total packets sent less than 300/600/1200/MTU + // bytes for all connections (context-wide) + } utp_context_stats; + + // Returned by utp_get_stats() + typedef struct + { + uint64 nbytes_recv; // total bytes received + uint64 nbytes_xmit; // total bytes transmitted + uint32 rexmit; // retransmit counter + uint32 fastrexmit; // fast retransmit counter + uint32 nxmit; // transmit counter + uint32 nrecv; // receive counter (total) + uint32 nduprecv; 
// duplicate receive counter + uint32 mtu_guess; // Best guess at MTU + } utp_socket_stats; + +#define UTP_IOV_MAX 1024 + + // For utp_writev, to writes data from multiple buffers + struct utp_iovec + { + void *iov_base; + size_t iov_len; + }; + + // Public Functions + utp_context * + utp_init(int version); + void + utp_destroy(utp_context *ctx); + void + utp_set_callback(utp_context *ctx, int callback_name, utp_callback_t *proc); + void * + utp_context_set_userdata(utp_context *ctx, void *userdata); + void * + utp_context_get_userdata(utp_context *ctx); + int + utp_context_set_option(utp_context *ctx, int opt, int val); + int + utp_context_get_option(utp_context *ctx, int opt); + int + utp_process_udp(utp_context *ctx, const byte *buf, size_t len, + const struct sockaddr *to, socklen_t tolen); + int + utp_process_icmp_error(utp_context *ctx, const byte *buffer, size_t len, + const struct sockaddr *to, socklen_t tolen); + int + utp_process_icmp_fragmentation(utp_context *ctx, const byte *buffer, + size_t len, const struct sockaddr *to, + socklen_t tolen, uint16 next_hop_mtu); + void + utp_check_timeouts(utp_context *ctx); + void + utp_issue_deferred_acks(utp_context *ctx); + utp_context_stats * + utp_get_context_stats(utp_context *ctx); + utp_socket * + utp_create_socket(utp_context *ctx); + void * + utp_set_userdata(utp_socket *s, void *userdata); + void * + utp_get_userdata(utp_socket *s); + int + utp_setsockopt(utp_socket *s, int opt, int val); + int + utp_getsockopt(utp_socket *s, int opt); + int + utp_connect(utp_socket *s, const struct sockaddr *to, socklen_t tolen); + ssize_t + utp_write(utp_socket *s, void *buf, size_t count); + ssize_t + utp_writev(utp_socket *s, struct utp_iovec *iovec, size_t num_iovecs); + int + utp_getpeername(utp_socket *s, struct sockaddr *addr, socklen_t *addrlen); + void + utp_read_drained(utp_socket *s); + int + utp_get_delays(utp_socket *s, uint32 *ours, uint32 *theirs, uint32 *age); + utp_socket_stats * + 
utp_get_stats(utp_socket *s); + utp_context * + utp_get_context(utp_socket *s); + void + utp_shutdown(utp_socket *s, int how); + void + utp_close(utp_socket *s); + +#ifdef __cplusplus +} +#endif + +#endif //__UTP_H__ diff --git a/include/utp_types.h b/include/utp_types.h new file mode 100644 index 000000000..55226a382 --- /dev/null +++ b/include/utp_types.h @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2010-2013 BitTorrent, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#ifndef __UTP_TYPES_H__ +#define __UTP_TYPES_H__ + +// Allow libutp consumers or prerequisites to override PACKED_ATTRIBUTE +#ifndef PACKED_ATTRIBUTE +#if defined BROKEN_GCC_STRUCTURE_PACKING && defined __GNUC__ +// Used for gcc tool chains accepting but not supporting pragma pack +// See http://gcc.gnu.org/onlinedocs/gcc/Type-Attributes.html +#define PACKED_ATTRIBUTE __attribute__((__packed__)) +#else +#define PACKED_ATTRIBUTE +#endif // defined BROKEN_GCC_STRUCTURE_PACKING && defined __GNUC__ +#endif // ndef PACKED_ATTRIBUTE + +#ifdef __GNUC__ +#define ALIGNED_ATTRIBUTE(x) __attribute__((aligned(x))) +#else +#define ALIGNED_ATTRIBUTE(x) +#endif + +// hash.cpp needs socket definitions, which is why this networking specific +// code is inclued in utypes.h +#ifdef WIN32 +#define _CRT_SECURE_NO_DEPRECATE +#define WIN32_LEAN_AND_MEAN +#include +#include +#include +#define IP_OPT_DONTFRAG IP_DONTFRAGMENT +#define SHUT_RD SD_RECEIVE +#define SHUT_WR SD_SEND +#define SHUT_RDWR SD_BOTH +#else +#include +#include +#include +#include + +#ifdef IP_DONTFRAG +#define IP_OPT_DONTFRAG IP_DONTFRAG +#elif defined IP_DONTFRAGMENT +#define IP_OPT_DONTFRAG IP_DONTFRAGMENT +#else +//#warning "I don't know how to set DF bit on this system" +#endif +#endif + +#ifdef _MSC_VER +#include +typedef SSIZE_T ssize_t; +#endif + +#ifdef POSIX +typedef struct sockaddr_storage SOCKADDR_STORAGE; +#endif + +#ifdef WIN32 +#define I64u "%I64u" +#else +#define I64u "%Lu" +#endif + +// standard types +typedef unsigned char byte; +typedef unsigned char uint8; +typedef signed char int8; +typedef unsigned short uint16; +typedef signed short int16; +typedef unsigned int uint; +typedef unsigned int uint32; +typedef signed int int32; + +#ifdef _MSC_VER +typedef unsigned __int64 uint64; +typedef signed __int64 int64; +#else +typedef unsigned long long uint64; +typedef long long int64; +#endif + +/* compile-time assert */ +#ifndef CASSERT +#define CASSERT(exp, name) typedef int is_not_##name[(exp) ? 
1 : -1]; +#endif + +CASSERT(8 == sizeof(uint64), sizeof_uint64_is_8) +CASSERT(8 == sizeof(int64), sizeof_int64_is_8) + +#ifndef INT64_MAX +#define INT64_MAX 0x7fffffffffffffffLL +#endif + +// always ANSI +typedef const char* cstr; +typedef char* str; + +#ifndef __cplusplus +typedef uint8 bool; +#endif + +#endif //__UTP_TYPES_H__ diff --git a/libutp/LICENSE b/libutp/LICENSE new file mode 100644 index 000000000..7f6e16c5f --- /dev/null +++ b/libutp/LICENSE @@ -0,0 +1,19 @@ +Copyright (c) 2010-2013 BitTorrent, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
diff --git a/libutp/Makefile b/libutp/Makefile new file mode 100644 index 000000000..818f3095a --- /dev/null +++ b/libutp/Makefile @@ -0,0 +1,48 @@ +OBJS = utp_internal.o utp_utils.o utp_hash.o utp_callbacks.o utp_api.o utp_packedsockaddr.o +CFLAGS = -Wall -DPOSIX -g -fno-exceptions $(OPT) +OPT ?= -O3 +CXXFLAGS = $(CFLAGS) -fPIC -fno-rtti +CC = gcc +CXX = g++ + +CXXFLAGS += -Wno-sign-compare +CXXFLAGS += -fpermissive + +# Uncomment to enable utp_get_stats(), and a few extra sanity checks +#CFLAGS += -D_DEBUG + +# Uncomment to enable debug logging +#CFLAGS += -DUTP_DEBUG_LOGGING + +# Dynamically determine if librt is available. If so, assume we need to link +# against it for clock_gettime(2). This is required for clean builds on OSX; +# see for more. This should +# probably be ported to CMake at some point, but is suitable for now. +lrt := $(shell echo 'int main() {}' | $(CC) -xc -o /dev/null - -lrt >/dev/null 2>&1; echo $$?) +ifeq ($(strip $(lrt)),0) + LDFLAGS += -lrt +endif + +all: libutp.so libutp.a ucat ucat-static + +libutp.so: $(OBJS) + $(CXX) $(CXXFLAGS) -o libutp.so -shared $(OBJS) + +libutp.a: $(OBJS) + ar rvs libutp.a $(OBJS) + +ucat: ucat.o libutp.so + $(CC) $(CFLAGS) -o ucat ucat.o -L. -lutp $(LDFLAGS) + +ucat-static: ucat.o libutp.a + $(CXX) $(CXXFLAGS) -o ucat-static ucat.o libutp.a $(LDFLAGS) + +clean: + rm -f *.o libutp.so libutp.a ucat ucat-static + +tags: $(shell ls *.cpp *.h) + rm -f tags + ctags *.cpp *.h + +anyway: clean all +.PHONY: clean all anyway diff --git a/libutp/README.md b/libutp/README.md new file mode 100644 index 000000000..a8e20ecac --- /dev/null +++ b/libutp/README.md @@ -0,0 +1,68 @@ +# libutp - The uTorrent Transport Protocol library. +Copyright (c) 2010 BitTorrent, Inc. + +uTP is a TCP-like implementation of [LEDBAT][ledbat] documented as a BitTorrent +extension in [BEP-29][bep29]. uTP provides reliable, ordered delivery +while maintaining minimum extra delay. 
It is implemented on top of UDP to be +cross-platform and functional today. As a result, uTP is the primary transport +for uTorrent peer-to-peer connections. + +uTP is written in C++, but the external interface is strictly C (ANSI C89). + +## The Interface + +The uTP socket interface is a bit different from the Berkeley socket API to +avoid the need for our own select() implementation, and to make it easier to +write event-based code with minimal buffering. + +When you create a uTP socket, you register a set of callbacks. Most notably, the +on_read callback is a reactive callback which occurs when bytes arrive off the +network. The write side of the socket is proactive, and you call UTP_Write to +indicate the number of bytes you wish to write. As packets are created, the +on_write callback is called for each packet, so you can fill the buffers with +data. + +The libutp interface is not thread-safe. It was designed for use in a +single-threaded asynchronous context, although with proper synchronization +it may be used from a multi-threaded environment as well. + +See utp.h for more details and other API documentation. + +## Example + +See ucat.c. Build with: + + make ucat + +## Building + +uTP has been known to build on Windows with MSVC and on linux and OS X with gcc. +On Windows, use the MSVC project files (utp.sln, and friends). On other platforms, +building the shared library is as simple as: + + make + +To build one of the examples, which will statically link in everything it needs +from libutp: + + cd utp_test && make + +## Packaging and API + +The libutp API is considered unstable, and probably always will be. We encourage +you to test with the version of libutp you have, and be mindful when upgrading. +For this reason, it is probably also a good idea to bundle libutp with your +application. + +## License + +libutp is released under the [MIT][lic] license. 
+ +## Related Work + +Research and analysis of congestion control mechanisms can be found [here.][survey] + +[ledbat]: http://datatracker.ietf.org/wg/ledbat/charter/ +[bep29]: http://www.bittorrent.org/beps/bep_0029.html +[lic]: http://www.opensource.org/licenses/mit-license.php +[survey]: http://datatracker.ietf.org/doc/draft-ietf-ledbat-survey/ diff --git a/libutp/libutp_inet_ntop.cpp b/libutp/libutp_inet_ntop.cpp new file mode 100644 index 000000000..b0edf8024 --- /dev/null +++ b/libutp/libutp_inet_ntop.cpp @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2010-2013 BitTorrent, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#define WIN32_LEAN_AND_MEAN +#include +#include +#include +#include "libutp_inet_ntop.h" + + +//###################################################################### +const char *libutp::inet_ntop(int af, const void *src, char *dest, size_t length) +{ + if (af != AF_INET && af != AF_INET6) + { + return NULL; + } + + SOCKADDR_STORAGE address; + DWORD address_length; + + if (af == AF_INET) + { + address_length = sizeof(sockaddr_in); + sockaddr_in* ipv4_address = (sockaddr_in*)(&address); + ipv4_address->sin_family = AF_INET; + ipv4_address->sin_port = 0; + memcpy(&ipv4_address->sin_addr, src, sizeof(in_addr)); + } + else // AF_INET6 + { + address_length = sizeof(sockaddr_in6); + sockaddr_in6* ipv6_address = (sockaddr_in6*)(&address); + ipv6_address->sin6_family = AF_INET6; + ipv6_address->sin6_port = 0; + ipv6_address->sin6_flowinfo = 0; + // hmmm + ipv6_address->sin6_scope_id = 0; + memcpy(&ipv6_address->sin6_addr, src, sizeof(in6_addr)); + } + + DWORD string_length = (DWORD)(length); + int result; + result = WSAAddressToStringA((sockaddr*)(&address), + address_length, 0, dest, + &string_length); + + // one common reason for this to fail is that ipv6 is not installed + + return result == SOCKET_ERROR ? 
NULL : dest; +} + +//###################################################################### +int libutp::inet_pton(int af, const char* src, void* dest) +{ + if (af != AF_INET && af != AF_INET6) + { + return -1; + } + + SOCKADDR_STORAGE address; + int address_length = sizeof(SOCKADDR_STORAGE); + int result = WSAStringToAddressA((char*)(src), af, 0, + (sockaddr*)(&address), + &address_length); + + if (af == AF_INET) + { + if (result != SOCKET_ERROR) + { + sockaddr_in* ipv4_address =(sockaddr_in*)(&address); + memcpy(dest, &ipv4_address->sin_addr, sizeof(in_addr)); + } + else if (strcmp(src, "255.255.255.255") == 0) + { + ((in_addr*)(dest))->s_addr = INADDR_NONE; + } + } + else // AF_INET6 + { + if (result != SOCKET_ERROR) + { + sockaddr_in6* ipv6_address = (sockaddr_in6*)(&address); + memcpy(dest, &ipv6_address->sin6_addr, sizeof(in6_addr)); + } + } + + return result == SOCKET_ERROR ? -1 : 1; +} diff --git a/libutp/libutp_inet_ntop.h b/libutp/libutp_inet_ntop.h new file mode 100644 index 000000000..33881d624 --- /dev/null +++ b/libutp/libutp_inet_ntop.h @@ -0,0 +1,68 @@ +#ifndef LIBUTP_INET_NTOP_H +#define LIBUTP_INET_NTOP_H + +/* + * Copyright (c) 2010-2013 BitTorrent, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +// About us linking the system inet_pton and inet_ntop symbols: +// 1) These symbols are usually defined on POSIX systems +// 2) They are not defined on Windows versions earlier than Vista +// Defined in: +// ut_utils/src/sockaddr.cpp +// libutp/win32_inet_ntop.obj +// +// When we drop support for XP we can just #include , and use the system functions +// For now, we will always use our functions on windows, on all builds +// The reason is: we would like the debug build to behave as much as the release build as possible +// It is much better to catch a problem in the debug build, than to link the system version +// in debug, and our version in the wild. 
+ +#if defined(_WIN32_WINNT) +#if _WIN32_WINNT >= 0x600 // Win32, post-XP +#include // for inet_ntop, inet_pton +#define INET_NTOP inet_ntop +#define INET_PTON inet_pton +#else +#define INET_NTOP libutp::inet_ntop // Win32, pre-XP: Use ours +#define INET_PTON libutp::inet_pton +#endif +#else // not WIN32 +#include // for inet_ntop, inet_pton +#define INET_NTOP inet_ntop +#define INET_PTON inet_pton +#endif + +//###################################################################### +//###################################################################### +namespace libutp { + + +//###################################################################### +const char *inet_ntop(int af, const void *src, char *dest, size_t length); + +//###################################################################### +int inet_pton(int af, const char* src, void* dest); + + +} //namespace libutp + +#endif // LIBUTP_INET_NTOP_H \ No newline at end of file diff --git a/libutp/parse_log.py b/libutp/parse_log.py new file mode 100644 index 000000000..44f69256e --- /dev/null +++ b/libutp/parse_log.py @@ -0,0 +1,288 @@ +import os, sys, time + +# usage: parse_log.py log-file [socket-index to focus on] + + +socket_filter = None +if len(sys.argv) >= 3: + socket_filter = sys.argv[2].strip() + +if socket_filter == None: + print "scanning for socket with the most packets" + file = open(sys.argv[1], 'rb') + + sockets = {} + + for l in file: + if not 'our_delay' in l: continue + + try: + a = l.strip().split(" ") + socket_index = a[1][:-1] + except: + continue + + # msvc's runtime library doesn't prefix pointers + # with '0x' +# if socket_index[:2] != '0x': +# continue + + if socket_index in sockets: + sockets[socket_index] += 1 + else: + sockets[socket_index] = 1 + + items = sockets.items() + items.sort(lambda x, y: y[1] - x[1]) + + count = 0 + for i in items: + print '%s: %d' % (i[0], i[1]) + count += 1 + if count > 5: break + + file.close() + socket_filter = items[0][0] + print '\nfocusing on 
socket %s' % socket_filter + +file = open(sys.argv[1], 'rb') +out_file = 'utp.out%s' % socket_filter; +out = open(out_file, 'wb') + +delay_samples = 'dots lc rgb "blue"' +delay_base = 'steps lw 2 lc rgb "purple"' +target_delay = 'steps lw 2 lc rgb "red"' +off_target = 'dots lc rgb "blue"' +cwnd = 'steps lc rgb "green"' +window_size = 'steps lc rgb "sea-green"' +rtt = 'lines lc rgb "light-blue"' + +metrics = { + 'our_delay':['our delay (ms)', 'x1y2', delay_samples], + 'upload_rate':['send rate (B/s)', 'x1y1', 'lines'], + 'max_window':['cwnd (B)', 'x1y1', cwnd], + 'target_delay':['target delay (ms)', 'x1y2', target_delay], + 'cur_window':['bytes in-flight (B)', 'x1y1', window_size], + 'cur_window_packets':['number of packets in-flight', 'x1y2', 'steps'], + 'packet_size':['current packet size (B)', 'x1y2', 'steps'], + 'rtt':['rtt (ms)', 'x1y2', rtt], + 'off_target':['off-target (ms)', 'x1y2', off_target], + 'delay_sum':['delay sum (ms)', 'x1y2', 'steps'], + 'their_delay':['their delay (ms)', 'x1y2', delay_samples], + 'get_microseconds':['clock (us)', 'x1y1', 'steps'], + 'wnduser':['advertised window size (B)', 'x1y1', 'steps'], + + 'delay_base':['delay base (us)', 'x1y1', delay_base], + 'their_delay_base':['their delay base (us)', 'x1y1', delay_base], + 'their_actual_delay':['their actual delay (us)', 'x1y1', delay_samples], + 'actual_delay':['actual_delay (us)', 'x1y1', delay_samples] +} + +histogram_quantization = 1 +socket_index = None + +columns = [] + +begin = None + +title = "-" +packet_loss = 0 +packet_timeout = 0 + +delay_histogram = {} +window_size = {'0': 0, '1': 0} + +# [35301484] 0x00ec1190: actual_delay:1021583 our_delay:102 their_delay:-1021345 off_target:297 max_window:2687 upload_rate:18942 delay_base:1021481154 delay_sum:-1021242 target_delay:400 acked_bytes:1441 cur_window:2882 scaled_gain:2.432 + +counter = 0 + +print "reading log file" + +for l in file: + if "UTP_Connect" in l: + title = l[:-2] + if socket_filter != None: + title += ' socket: %s' % 
socket_filter + else: + title += ' sum of all sockets' + continue + + try: + a = l.strip().split(" ") + t = a[0][1:-1] + socket_index = a[1][:-1] + except: + continue +# if socket_index[:2] != '0x': +# continue + + if socket_filter != None and socket_index != socket_filter: + continue + + counter += 1 + if (counter % 300 == 0): + print "\r%d " % counter, + + if "lost." in l: + packet_loss = packet_loss + 1 + continue + if "Packet timeout" in l: + packet_timeout = packet_timeout + 1 + continue + if "our_delay:" not in l: + continue + +# used for Logf timestamps +# t, m = t.split(".") +# t = time.strptime(t, "%H:%M:%S") +# t = list(t) +# t[0] += 107 +# t = tuple(t) +# m = float(m) +# m /= 1000.0 +# t = time.mktime(t) + m + +# used for tick count timestamps + t = int(t) + + if begin is None: + begin = t + t = t - begin + # print time. Convert from milliseconds to seconds + print >>out, '%f\t' % (float(t)/1000.), + + #if t > 200000: + # break + + fill_columns = not columns + for i in a[2:]: + try: + n, v = i.split(':') + except: + continue + v = float(v) + if n == "our_delay": + bucket = v / histogram_quantization + delay_histogram[bucket] = 1 + delay_histogram.get(bucket, 0) + if not n in metrics: continue + if fill_columns: + columns.append(n) + if n == "max_window": + window_size[socket_index] = v + print >>out, '%f\t' % int(reduce(lambda a,b: a+b, window_size.values())), + else: + print >>out, '%f\t' % v, + print >>out, float(packet_loss * 8000), float(packet_timeout * 8000) + packet_loss = 0 + packet_timeout = 0 + +out.close() + +out = open('%s.histogram' % out_file, 'wb') +for d,f in delay_histogram.iteritems(): + print >>out, float(d*histogram_quantization) + histogram_quantization / 2, f +out.close() + + +plot = [ + { + 'data': ['upload_rate', 'max_window', 'cur_window', 'wnduser', 'cur_window_packets', 'packet_size', 'rtt'], + 'title': 'send-packet-size', + 'y1': 'Bytes', + 'y2': 'Time (ms)' + }, + { + 'data': ['our_delay', 'max_window', 'target_delay', 
'cur_window', 'wnduser', 'cur_window_packets'], + 'title': 'uploading', + 'y1': 'Bytes', + 'y2': 'Time (ms)' + }, + { + 'data': ['our_delay', 'max_window', 'target_delay', 'cur_window', 'cur_window_packets'], + 'title': 'uploading_packets', + 'y1': 'Bytes', + 'y2': 'Time (ms)' + }, + { + 'data': ['get_microseconds'], + 'title': 'timer', + 'y1': 'Time microseconds', + 'y2': 'Time (ms)' + }, + { + 'data': ['their_delay', 'target_delay', 'rtt'], + 'title': 'their_delay', + 'y1': '', + 'y2': 'Time (ms)' + }, + { + 'data': ['their_actual_delay','their_delay_base'], + 'title': 'their_delay_base', + 'y1': 'Time (us)', + 'y2': '' + }, + { + 'data': ['our_delay', 'target_delay', 'rtt'], + 'title': 'our-delay', + 'y1': '', + 'y2': 'Time (ms)' + }, + { + 'data': ['actual_delay', 'delay_base'], + 'title': 'our_delay_base', + 'y1': 'Time (us)', + 'y2': '' + } +] + +out = open('utp.gnuplot', 'w+') + +files = '' + +#print >>out, 'set xtics 0, 20' +print >>out, "set term png size 1280,800" +print >>out, 'set output "%s.delays.png"' % out_file +print >>out, 'set xrange [0:250]' +print >>out, 'set xlabel "delay (ms)"' +print >>out, 'set boxwidth 1' +print >>out, 'set style fill solid' +print >>out, 'set ylabel "number of packets"' +print >>out, 'plot "%s.histogram" using 1:2 with boxes' % out_file + +print >>out, "set style data steps" +#print >>out, "set yrange [0:*]" +print >>out, "set y2range [*:*]" +files += out_file + '.delays.png ' +#set hidden3d +#set title "Peer bandwidth distribution" +#set xlabel "Ratio" + +for p in plot: + print >>out, 'set title "%s %s"' % (p['title'], title) + print >>out, 'set xlabel "time (s)"' + print >>out, 'set ylabel "%s"' % p['y1'] + print >>out, "set tics nomirror" + print >>out, 'set y2tics' + print >>out, 'set y2label "%s"' % p['y2'] + print >>out, 'set xrange [0:*]' + print >>out, "set key box" + print >>out, "set term png size 1280,800" + print >>out, 'set output "%s-%s.png"' % (out_file, p['title']) + files += '%s-%s.png ' % (out_file, 
p['title']) + + comma = '' + print >>out, "plot", + + for c in p['data']: + if not c in metrics: continue + i = columns.index(c) + print >>out, '%s"%s" using 1:%d title "%s-%s" axes %s with %s' % (comma, out_file, i + 2, metrics[c][0], metrics[c][1], metrics[c][1], metrics[c][2]), + comma = ', ' + print >>out, '' + +out.close() + +os.system("gnuplot utp.gnuplot") + +os.system("open %s" % files) + diff --git a/libutp/utp_api.cpp b/libutp/utp_api.cpp new file mode 100644 index 000000000..63aff189c --- /dev/null +++ b/libutp/utp_api.cpp @@ -0,0 +1,139 @@ +// vim:set ts=4 sw=4 ai: + +/* + * Copyright (c) 2010-2013 BitTorrent, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include +#include "utp_internal.h" +#include "utp_utils.h" + +extern "C" { + +const char * utp_callback_names[] = { + "UTP_ON_FIREWALL", + "UTP_ON_ACCEPT", + "UTP_ON_CONNECT", + "UTP_ON_ERROR", + "UTP_ON_READ", + "UTP_ON_OVERHEAD_STATISTICS", + "UTP_ON_STATE_CHANGE", + "UTP_GET_READ_BUFFER_SIZE", + "UTP_ON_DELAY_SAMPLE", + "UTP_GET_UDP_MTU", + "UTP_GET_UDP_OVERHEAD", + "UTP_GET_MILLISECONDS", + "UTP_GET_MICROSECONDS", + "UTP_GET_RANDOM", + "UTP_LOG", + "UTP_SENDTO", +}; + +const char * utp_error_code_names[] = { + "UTP_ECONNREFUSED", + "UTP_ECONNRESET", + "UTP_ETIMEDOUT", +}; + +const char *utp_state_names[] = { + NULL, + "UTP_STATE_CONNECT", + "UTP_STATE_WRITABLE", + "UTP_STATE_EOF", + "UTP_STATE_DESTROYING", +}; + +struct_utp_context::struct_utp_context() + : userdata(NULL) + , current_ms(0) + , last_utp_socket(NULL) + , log_normal(false) + , log_mtu(false) + , log_debug(false) +{ + memset(&context_stats, 0, sizeof(context_stats)); + memset(callbacks, 0, sizeof(callbacks)); + target_delay = CCONTROL_TARGET; + utp_sockets = new UTPSocketHT; + + callbacks[UTP_GET_UDP_MTU] = &utp_default_get_udp_mtu; + callbacks[UTP_GET_UDP_OVERHEAD] = &utp_default_get_udp_overhead; + callbacks[UTP_GET_MILLISECONDS] = &utp_default_get_milliseconds; + callbacks[UTP_GET_MICROSECONDS] = &utp_default_get_microseconds; + callbacks[UTP_GET_RANDOM] = &utp_default_get_random; + + // 1 MB of receive buffer (i.e. max bandwidth delay product) + // means that from a peer with 200 ms RTT, we cannot receive + // faster than 5 MB/s + // from a peer with 10 ms RTT, we cannot receive faster than + // 100 MB/s. 
This is assumed to be good enough, since bandwidth + // often is proportional to RTT anyway + // when setting a download rate limit, all sockets should have + // their receive buffer set much lower, to say 60 kiB or so + opt_rcvbuf = opt_sndbuf = 1024 * 1024; + last_check = 0; +} + +struct_utp_context::~struct_utp_context() { + delete this->utp_sockets; +} + +utp_context* utp_init (int version) +{ + assert(version == 2); + if (version != 2) + return NULL; + utp_context *ctx = new utp_context; + return ctx; +} + +void utp_destroy(utp_context *ctx) { + assert(ctx); + if (ctx) delete ctx; +} + +void utp_set_callback(utp_context *ctx, int callback_name, utp_callback_t *proc) { + assert(ctx); + if (ctx) ctx->callbacks[callback_name] = proc; +} + +void* utp_context_set_userdata(utp_context *ctx, void *userdata) { + assert(ctx); + if (ctx) ctx->userdata = userdata; + return ctx ? ctx->userdata : NULL; +} + +void* utp_context_get_userdata(utp_context *ctx) { + assert(ctx); + return ctx ? ctx->userdata : NULL; +} + +utp_context_stats* utp_get_context_stats(utp_context *ctx) { + assert(ctx); + return ctx ? &ctx->context_stats : NULL; +} + +ssize_t utp_write(utp_socket *socket, void *buf, size_t len) { + struct utp_iovec iovec = { buf, len }; + return utp_writev(socket, &iovec, 1); +} + +} diff --git a/libutp/utp_callbacks.cpp b/libutp/utp_callbacks.cpp new file mode 100644 index 000000000..9540d8c40 --- /dev/null +++ b/libutp/utp_callbacks.cpp @@ -0,0 +1,208 @@ +// vim:set ts=4 sw=4 ai: + +/* + * Copyright (c) 2010-2013 BitTorrent, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "utp_callbacks.h" + +int utp_call_on_firewall(utp_context *ctx, const struct sockaddr *address, socklen_t address_len) +{ + utp_callback_arguments args; + if (!ctx->callbacks[UTP_ON_FIREWALL]) return 0; + args.callback_type = UTP_ON_FIREWALL; + args.context = ctx; + args.socket = NULL; + args.address = address; + args.address_len = address_len; + return (int)ctx->callbacks[UTP_ON_FIREWALL](&args); +} + +void utp_call_on_accept(utp_context *ctx, utp_socket *socket, const struct sockaddr *address, socklen_t address_len) +{ + utp_callback_arguments args; + if (!ctx->callbacks[UTP_ON_ACCEPT]) return; + args.callback_type = UTP_ON_ACCEPT; + args.context = ctx; + args.socket = socket; + args.address = address; + args.address_len = address_len; + ctx->callbacks[UTP_ON_ACCEPT](&args); +} + +void utp_call_on_connect(utp_context *ctx, utp_socket *socket) +{ + utp_callback_arguments args; + if (!ctx->callbacks[UTP_ON_CONNECT]) return; + args.callback_type = UTP_ON_CONNECT; + args.context = ctx; + args.socket = socket; + ctx->callbacks[UTP_ON_CONNECT](&args); +} + +void utp_call_on_error(utp_context *ctx, utp_socket *socket, int error_code) +{ + utp_callback_arguments args; + if (!ctx->callbacks[UTP_ON_ERROR]) return; + args.callback_type = UTP_ON_ERROR; + args.context = ctx; + args.socket = socket; + args.error_code = error_code; + ctx->callbacks[UTP_ON_ERROR](&args); +} + +void utp_call_on_read(utp_context *ctx, utp_socket *socket, const byte *buf, size_t len) +{ + utp_callback_arguments args; + if (!ctx->callbacks[UTP_ON_READ]) return; + args.callback_type = UTP_ON_READ; + args.context = ctx; + args.socket = socket; + args.buf = buf; + args.len = len; + ctx->callbacks[UTP_ON_READ](&args); +} + +void utp_call_on_overhead_statistics(utp_context *ctx, utp_socket *socket, int send, size_t len, int type) +{ + utp_callback_arguments args; + if (!ctx->callbacks[UTP_ON_OVERHEAD_STATISTICS]) return; + args.callback_type = UTP_ON_OVERHEAD_STATISTICS; + args.context = 
ctx; + args.socket = socket; + args.send = send; + args.len = len; + args.type = type; + ctx->callbacks[UTP_ON_OVERHEAD_STATISTICS](&args); +} + +void utp_call_on_delay_sample(utp_context *ctx, utp_socket *socket, int sample_ms) +{ + utp_callback_arguments args; + if (!ctx->callbacks[UTP_ON_DELAY_SAMPLE]) return; + args.callback_type = UTP_ON_DELAY_SAMPLE; + args.context = ctx; + args.socket = socket; + args.sample_ms = sample_ms; + ctx->callbacks[UTP_ON_DELAY_SAMPLE](&args); +} + +void utp_call_on_state_change(utp_context *ctx, utp_socket *socket, int state) +{ + utp_callback_arguments args; + if (!ctx->callbacks[UTP_ON_STATE_CHANGE]) return; + args.callback_type = UTP_ON_STATE_CHANGE; + args.context = ctx; + args.socket = socket; + args.state = state; + ctx->callbacks[UTP_ON_STATE_CHANGE](&args); +} + +uint16 utp_call_get_udp_mtu(utp_context *ctx, utp_socket *socket, const struct sockaddr *address, socklen_t address_len) +{ + utp_callback_arguments args; + if (!ctx->callbacks[UTP_GET_UDP_MTU]) return 0; + args.callback_type = UTP_GET_UDP_MTU; + args.context = ctx; + args.socket = socket; + args.address = address; + args.address_len = address_len; + return (uint16)ctx->callbacks[UTP_GET_UDP_MTU](&args); +} + +uint16 utp_call_get_udp_overhead(utp_context *ctx, utp_socket *socket, const struct sockaddr *address, socklen_t address_len) +{ + utp_callback_arguments args; + if (!ctx->callbacks[UTP_GET_UDP_OVERHEAD]) return 0; + args.callback_type = UTP_GET_UDP_OVERHEAD; + args.context = ctx; + args.socket = socket; + args.address = address; + args.address_len = address_len; + return (uint16)ctx->callbacks[UTP_GET_UDP_OVERHEAD](&args); +} + +uint64 utp_call_get_milliseconds(utp_context *ctx, utp_socket *socket) +{ + utp_callback_arguments args; + if (!ctx->callbacks[UTP_GET_MILLISECONDS]) return 0; + args.callback_type = UTP_GET_MILLISECONDS; + args.context = ctx; + args.socket = socket; + return ctx->callbacks[UTP_GET_MILLISECONDS](&args); +} + +uint64 
utp_call_get_microseconds(utp_context *ctx, utp_socket *socket) +{ + utp_callback_arguments args; + if (!ctx->callbacks[UTP_GET_MICROSECONDS]) return 0; + args.callback_type = UTP_GET_MICROSECONDS; + args.context = ctx; + args.socket = socket; + return ctx->callbacks[UTP_GET_MICROSECONDS](&args); +} + +uint32 utp_call_get_random(utp_context *ctx, utp_socket *socket) +{ + utp_callback_arguments args; + if (!ctx->callbacks[UTP_GET_RANDOM]) return 0; + args.callback_type = UTP_GET_RANDOM; + args.context = ctx; + args.socket = socket; + return (uint32)ctx->callbacks[UTP_GET_RANDOM](&args); +} + +size_t utp_call_get_read_buffer_size(utp_context *ctx, utp_socket *socket) +{ + utp_callback_arguments args; + if (!ctx->callbacks[UTP_GET_READ_BUFFER_SIZE]) return 0; + args.callback_type = UTP_GET_READ_BUFFER_SIZE; + args.context = ctx; + args.socket = socket; + return (size_t)ctx->callbacks[UTP_GET_READ_BUFFER_SIZE](&args); +} + +void utp_call_log(utp_context *ctx, utp_socket *socket, const byte *buf) +{ + utp_callback_arguments args; + if (!ctx->callbacks[UTP_LOG]) return; + args.callback_type = UTP_LOG; + args.context = ctx; + args.socket = socket; + args.buf = buf; + ctx->callbacks[UTP_LOG](&args); +} + +void utp_call_sendto(utp_context *ctx, utp_socket *socket, const byte *buf, size_t len, const struct sockaddr *address, socklen_t address_len, uint32 flags) +{ + utp_callback_arguments args; + if (!ctx->callbacks[UTP_SENDTO]) return; + args.callback_type = UTP_SENDTO; + args.context = ctx; + args.socket = socket; + args.buf = buf; + args.len = len; + args.address = address; + args.address_len = address_len; + args.flags = flags; + ctx->callbacks[UTP_SENDTO](&args); +} + diff --git a/libutp/utp_callbacks.h b/libutp/utp_callbacks.h new file mode 100644 index 000000000..649e7e14f --- /dev/null +++ b/libutp/utp_callbacks.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2010-2013 BitTorrent, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#ifndef __UTP_CALLBACKS_H__ +#define __UTP_CALLBACKS_H__ + +#include "utp.h" +#include "utp_internal.h" + +// Generated by running: grep ^[a-z] utp_callbacks.cpp | sed 's/$/;/' +int utp_call_on_firewall(utp_context *ctx, const struct sockaddr *address, socklen_t address_len); +void utp_call_on_accept(utp_context *ctx, utp_socket *s, const struct sockaddr *address, socklen_t address_len); +void utp_call_on_connect(utp_context *ctx, utp_socket *s); +void utp_call_on_error(utp_context *ctx, utp_socket *s, int error_code); +void utp_call_on_read(utp_context *ctx, utp_socket *s, const byte *buf, size_t len); +void utp_call_on_overhead_statistics(utp_context *ctx, utp_socket *s, int send, size_t len, int type); +void utp_call_on_delay_sample(utp_context *ctx, utp_socket *s, int sample_ms); +void utp_call_on_state_change(utp_context *ctx, utp_socket *s, int state); +uint16 utp_call_get_udp_mtu(utp_context *ctx, utp_socket *s, const struct sockaddr *address, socklen_t address_len); +uint16 utp_call_get_udp_overhead(utp_context *ctx, utp_socket *s, const struct sockaddr *address, socklen_t address_len); +uint64 utp_call_get_milliseconds(utp_context *ctx, utp_socket *s); +uint64 utp_call_get_microseconds(utp_context *ctx, utp_socket *s); +uint32 utp_call_get_random(utp_context *ctx, utp_socket *s); +size_t utp_call_get_read_buffer_size(utp_context *ctx, utp_socket *s); +void utp_call_log(utp_context *ctx, utp_socket *s, const byte *buf); +void utp_call_sendto(utp_context *ctx, utp_socket *s, const byte *buf, size_t len, const struct sockaddr *address, socklen_t address_len, uint32 flags); + +#endif // __UTP_CALLBACKS_H__ diff --git a/libutp/utp_hash.cpp b/libutp/utp_hash.cpp new file mode 100644 index 000000000..a4a71d906 --- /dev/null +++ b/libutp/utp_hash.cpp @@ -0,0 +1,246 @@ +/* + * Copyright (c) 2010-2013 BitTorrent, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "utp_hash.h" +#include "utp_types.h" + +#define LIBUTP_HASH_UNUSED ((utp_link_t)-1) + +#ifdef STRICT_ALIGN +inline uint32 Read32(const void *p) +{ + uint32 tmp; + memcpy(&tmp, p, sizeof tmp); + return tmp; +} + +#else +inline uint32 Read32(const void *p) { return *(uint32*)p; } +#endif + + +// Get the amount of memory required for the hash parameters and the bucket set +// Waste a space for an unused bucket in order to ensure the following managed memory have 32-bit aligned addresses +// TODO: make this 64-bit clean +#define BASE_SIZE(bc) (sizeof(utp_hash_t) + sizeof(utp_link_t) * ((bc) + 1)) + +// Get a pointer to the base of the structure array managed by the hash table +#define get_bep(h) ((byte*)(h)) + BASE_SIZE((h)->N) + +// Get the address of the information associated with a specific structure in the array, +// given the address of the base of the structure. 
+// This assumes a utp_link_t link member is at the end of the structure. +// Given compilers filling out the memory to a 32-bit clean value, this may mean that +// the location named in the structure may not be the location actually used by the hash table, +// since the compiler may have padded the end of the structure with 2 bytes after the utp_link_t member. +// TODO: this macro should not require that the variable pointing at the hash table be named 'hash' +#define ptr_to_link(p) (utp_link_t *) (((byte *) (p)) + hash->E - sizeof(utp_link_t)) + +// Calculate how much to allocate for a hash table with bucket count, total size, and structure count +// TODO: make this 64-bit clean +#define ALLOCATION_SIZE(bc, ts, sc) (BASE_SIZE((bc)) + (ts) * (sc)) + +utp_hash_t *utp_hash_create(int N, int key_size, int total_size, int initial, utp_hash_compute_t hashfun, utp_hash_equal_t compfun) +{ + // Must have odd number of hash buckets (prime number is best) + assert(N % 2); + // Ensure structures will be at aligned memory addresses + // TODO: make this 64-bit clean + assert(0 == (total_size % 4)); + + int size = ALLOCATION_SIZE(N, total_size, initial); + utp_hash_t *hash = (utp_hash_t *) malloc( size ); + memset( hash, 0, size ); + + for (int i = 0; i < N + 1; ++i) + hash->inits[i] = LIBUTP_HASH_UNUSED; + hash->N = N; + hash->K = key_size; + hash->E = total_size; + hash->hash_compute = hashfun; + hash->hash_equal = compfun; + hash->allocated = initial; + hash->count = 0; + hash->used = 0; + hash->free = LIBUTP_HASH_UNUSED; + return hash; +} + +uint utp_hash_mem(const void *keyp, size_t keysize) +{ + uint hash = 0; + uint n = keysize; + while (n >= 4) { + hash ^= Read32(keyp); + keyp = (byte*)keyp + sizeof(uint32); + hash = (hash << 13) | (hash >> 19); + n -= 4; + } + while (n != 0) { + hash ^= *(byte*)keyp; + keyp = (byte*)keyp + sizeof(byte); + hash = (hash << 8) | (hash >> 24); + n--; + } + return hash; +} + +uint utp_hash_mkidx(utp_hash_t *hash, const void *keyp) +{ + // 
Generate a key from the hash + return hash->hash_compute(keyp, hash->K) % hash->N; +} + +static inline bool compare(byte *a, byte *b,int n) +{ + assert(n >= 4); + if (Read32(a) != Read32(b)) return false; + return memcmp(a+4, b+4, n-4) == 0; +} + +#define COMPARE(h,k1,k2,ks) (((h)->hash_equal) ? (h)->hash_equal((void*)k1,(void*)k2,ks) : compare(k1,k2,ks)) + +// Look-up a key in the hash table. +// Returns NULL if not found +void *utp_hash_lookup(utp_hash_t *hash, const void *key) +{ + utp_link_t idx = utp_hash_mkidx(hash, key); + + // base pointer + byte *bep = get_bep(hash); + + utp_link_t cur = hash->inits[idx]; + while (cur != LIBUTP_HASH_UNUSED) { + byte *key2 = bep + (cur * hash->E); + if (COMPARE(hash, (byte*)key, key2, hash->K)) + return key2; + cur = *ptr_to_link(key2); + } + + return NULL; +} + +// Add a new element to the hash table. +// Returns a pointer to the new element. +// This assumes the element is not already present! +void *utp_hash_add(utp_hash_t **hashp, const void *key) +{ + //Allocate a new entry + byte *elemp; + utp_link_t elem; + utp_hash_t *hash = *hashp; + utp_link_t idx = utp_hash_mkidx(hash, key); + + if ((elem=hash->free) == LIBUTP_HASH_UNUSED) { + utp_link_t all = hash->allocated; + if (hash->used == all) { + utp_hash_t *nhash; + if (all <= (LIBUTP_HASH_UNUSED/2)) { + all *= 2; + } else if (all != LIBUTP_HASH_UNUSED) { + all = LIBUTP_HASH_UNUSED; + } else { + // too many items! can't grow! + assert(0); + return NULL; + } + // otherwise need to allocate. 
+ nhash = (utp_hash_t*)realloc(hash, ALLOCATION_SIZE(hash->N, hash->E, all)); + if (!nhash) { + // out of memory (or too big to allocate) + assert(nhash); + return NULL; + } + hash = *hashp = nhash; + hash->allocated = all; + } + + elem = hash->used++; + elemp = get_bep(hash) + elem * hash->E; + } else { + elemp = get_bep(hash) + elem * hash->E; + hash->free = *ptr_to_link(elemp); + } + + *ptr_to_link(elemp) = hash->inits[idx]; + hash->inits[idx] = elem; + hash->count++; + + // copy key into it + memcpy(elemp, key, hash->K); + return elemp; +} + +// Delete an element from the utp_hash_t +// Returns a pointer to the already deleted element. +void *utp_hash_del(utp_hash_t *hash, const void *key) +{ + utp_link_t idx = utp_hash_mkidx(hash, key); + + // base pointer + byte *bep = get_bep(hash); + + utp_link_t *curp = &hash->inits[idx]; + utp_link_t cur; + while ((cur=*curp) != LIBUTP_HASH_UNUSED) { + byte *key2 = bep + (cur * hash->E); + if (COMPARE(hash,(byte*)key,(byte*)key2, hash->K )) { + // found an item that matched. unlink it + *curp = *ptr_to_link(key2); + // Insert into freelist + *ptr_to_link(key2) = hash->free; + hash->free = cur; + hash->count--; + return key2; + } + curp = ptr_to_link(key2); + } + + return NULL; +} + +void *utp_hash_iterate(utp_hash_t *hash, utp_hash_iterator_t *iter) +{ + utp_link_t elem; + + if ((elem=iter->elem) == LIBUTP_HASH_UNUSED) { + // Find a bucket with an element + utp_link_t buck = iter->bucket + 1; + for(;;) { + if (buck >= hash->N) + return NULL; + if ((elem = hash->inits[buck]) != LIBUTP_HASH_UNUSED) + break; + buck++; + } + iter->bucket = buck; + } + + byte *elemp = get_bep(hash) + (elem * hash->E); + iter->elem = *ptr_to_link(elemp); + return elemp; +} + +void utp_hash_free_mem(utp_hash_t* hash) +{ + free(hash); +} diff --git a/libutp/utp_hash.h b/libutp/utp_hash.h new file mode 100644 index 000000000..72c17e3bd --- /dev/null +++ b/libutp/utp_hash.h @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2010-2013 BitTorrent, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#ifndef __UTP_HASH_H__ +#define __UTP_HASH_H__ + +#include // memset +#include // malloc + +#include "utp_types.h" +#include "utp_templates.h" + +// TODO: make utp_link_t a template parameter to HashTable +typedef uint32 utp_link_t; + +#ifdef _MSC_VER +// Silence the warning about the C99-compliant zero-length array at the end of the structure +#pragma warning (disable: 4200) +#endif + +typedef uint32 (*utp_hash_compute_t)(const void *keyp, size_t keysize); +typedef uint (*utp_hash_equal_t)(const void *key_a, const void *key_b, size_t keysize); + +// In memory the HashTable is laid out as follows: +// ---------------------------- low +// | hash table data members | +// ---------------------------- _ +// | indices | ^ +// | . | | utp_link_t indices into the key-values. +// | . | . 
+// ---------------------------- - <----- bep +// | keys and values | each key-value pair has size total_size +// | . | +// | . | +// ---------------------------- high +// +// The code depends on the ability of the compiler to pad the length +// of the hash table data members structure to +// a length divisible by 32-bits with no remainder. +// +// Since the number of hash buckets (indices) should be odd, the code +// asserts this and adds one to the hash bucket count to ensure that the +// following key-value pairs array starts on a 32-bit boundary. +// +// The key-value pairs array should start on a 32-bit boundary, otherwise +// processors like the ARM will silently mangle 32-bit data in these structures +// (e.g., turning 0xABCD into 0XCDAB when moving a value from memory to register +// when the memory address is 16 bits offset from a 32-bit boundary), +// also, the value will be stored at an address two bytes lower than the address +// value would ordinarily indicate. +// +// The key-value pair is of type T. The first field in T must +// be the key, i.e., the first K bytes of T contains the key. +// total_size = sizeof(T) and thus sizeof(T) >= sizeof(K) +// +// N is the number of buckets. 
+// +struct utp_hash_t { + utp_link_t N; + byte K; + byte E; + size_t count; + utp_hash_compute_t hash_compute; + utp_hash_equal_t hash_equal; + utp_link_t allocated; + utp_link_t used; + utp_link_t free; + utp_link_t inits[0]; +}; + +#ifdef _MSC_VER +#pragma warning (default: 4200) +#endif + +struct utp_hash_iterator_t { + utp_link_t bucket; + utp_link_t elem; + + utp_hash_iterator_t() : bucket(0xffffffff), elem(0xffffffff) {} +}; + +uint utp_hash_mem(const void *keyp, size_t keysize); +uint utp_hash_comp(const void *key_a, const void *key_b, size_t keysize); + +utp_hash_t *utp_hash_create(int N, int key_size, int total_size, int initial, utp_hash_compute_t hashfun = utp_hash_mem, utp_hash_equal_t eqfun = NULL); +void *utp_hash_lookup(utp_hash_t *hash, const void *key); +void *utp_hash_add(utp_hash_t **hashp, const void *key); +void *utp_hash_del(utp_hash_t *hash, const void *key); + +void *utp_hash_iterate(utp_hash_t *hash, utp_hash_iterator_t *iter); +void utp_hash_free_mem(utp_hash_t *hash); + +/* + This HashTable requires that T have at least sizeof(K)+sizeof(utp_link_t) bytes. 
+ Usually done like this: + + struct K { + int whatever; + }; + + struct T { + K wtf; + utp_link_t link; // also wtf + }; +*/ + +template class utpHashTable { + utp_hash_t *hash; +public: + static uint compare(const void *k1, const void *k2, size_t ks) { + return *((K*)k1) == *((K*)k2); + } + static uint32 compute_hash(const void *k, size_t ks) { + return ((K*)k)->compute_hash(); + } + void Init() { hash = NULL; } + bool Allocated() { return (hash != NULL); } + void Free() { utp_hash_free_mem(hash); hash = NULL; } + void Create(int N, int initial) { hash = utp_hash_create(N, sizeof(K), sizeof(T), initial, &compute_hash, &compare); } + T *Lookup(const K &key) { return (T*)utp_hash_lookup(hash, &key); } + T *Add(const K &key) { return (T*)utp_hash_add(&hash, &key); } + T *Delete(const K &key) { return (T*)utp_hash_del(hash, &key); } + T *Iterate(utp_hash_iterator_t &iterator) { return (T*)utp_hash_iterate(hash, &iterator); } + size_t GetCount() { return hash->count; } +}; + +#endif //__UTP_HASH_H__ diff --git a/libutp/utp_internal.cpp b/libutp/utp_internal.cpp new file mode 100644 index 000000000..7d6fdc1aa --- /dev/null +++ b/libutp/utp_internal.cpp @@ -0,0 +1,3489 @@ +/* + * Copyright (c) 2010-2013 BitTorrent, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include // for UINT_MAX +#include + +#include "utp_types.h" +#include "utp_packedsockaddr.h" +#include "utp_internal.h" +#include "utp_hash.h" + +#define TIMEOUT_CHECK_INTERVAL 500 + +// number of bytes to increase max window size by, per RTT. This is +// scaled down linearly proportional to off_target. i.e. if all packets +// in one window have 0 delay, window size will increase by this number. +// Typically it's less. TCP increases one MSS per RTT, which is 1500 +#define MAX_CWND_INCREASE_BYTES_PER_RTT 3000 +#define CUR_DELAY_SIZE 3 +// experiments suggest that a clock skew of 10 ms per 325 seconds +// is not impossible. Reset delay_base every 13 minutes. The clock +// skew is dealt with by observing the delay base in the other +// direction, and adjusting our own upwards if the opposite direction +// delay base keeps going down +#define DELAY_BASE_HISTORY 13 +#define MAX_WINDOW_DECAY 100 // ms + +#define REORDER_BUFFER_SIZE 32 +#define REORDER_BUFFER_MAX_SIZE 1024 +#define OUTGOING_BUFFER_MAX_SIZE 1024 + +#define PACKET_SIZE 1435 + +// this is the minimum max_window value. 
It can never drop below this +#define MIN_WINDOW_SIZE 10 + +// if we receive 4 or more duplicate acks, we resend the packet +// that hasn't been acked yet +#define DUPLICATE_ACKS_BEFORE_RESEND 3 + +// Allow a reception window of at least 3 ack_nrs behind seq_nr +// A non-SYN packet with an ack_nr difference greater than this is +// considered suspicious and ignored +#define ACK_NR_ALLOWED_WINDOW DUPLICATE_ACKS_BEFORE_RESEND + +#define RST_INFO_TIMEOUT 10000 +#define RST_INFO_LIMIT 1000 +// 29 seconds determined from measuring many home NAT devices +#define KEEPALIVE_INTERVAL 29000 + + +#define SEQ_NR_MASK 0xFFFF +#define ACK_NR_MASK 0xFFFF +#define TIMESTAMP_MASK 0xFFFFFFFF + +#define DIV_ROUND_UP(num, denom) ((num + denom - 1) / denom) + +// The totals are derived from the following data: +// 45: IPv6 address including embedded IPv4 address +// 11: Scope Id +// 2: Brackets around IPv6 address when port is present +// 6: Port (including colon) +// 1: Terminating null byte +char addrbuf[65]; +#define addrfmt(x, s) x.fmt(s, sizeof(s)) + + +#if (defined(__SVR4) && defined(__sun)) + #pragma pack(1) +#else + #pragma pack(push,1) +#endif + + +// these packet sizes are including the uTP header wich +// is either 20 or 23 bytes depending on version +#define PACKET_SIZE_EMPTY_BUCKET 0 +#define PACKET_SIZE_EMPTY 23 +#define PACKET_SIZE_SMALL_BUCKET 1 +#define PACKET_SIZE_SMALL 373 +#define PACKET_SIZE_MID_BUCKET 2 +#define PACKET_SIZE_MID 723 +#define PACKET_SIZE_BIG_BUCKET 3 +#define PACKET_SIZE_BIG 1400 +#define PACKET_SIZE_HUGE_BUCKET 4 + +struct PACKED_ATTRIBUTE PacketFormatV1 { + // packet_type (4 high bits) + // protocol version (4 low bits) + byte ver_type; + byte version() const { return ver_type & 0xf; } + byte type() const { return ver_type >> 4; } + void set_version(byte v) { ver_type = (ver_type & 0xf0) | (v & 0xf); } + void set_type(byte t) { ver_type = (ver_type & 0xf) | (t << 4); } + + // Type of the first extension header + byte ext; + // connection ID + 
uint16_big connid; + uint32_big tv_usec; + uint32_big reply_micro; + // receive window size in bytes + uint32_big windowsize; + // Sequence number + uint16_big seq_nr; + // Acknowledgment number + uint16_big ack_nr; +}; + +struct PACKED_ATTRIBUTE PacketFormatAckV1 { + PacketFormatV1 pf; + byte ext_next; + byte ext_len; + byte acks[4]; +}; + +#if (defined(__SVR4) && defined(__sun)) + #pragma pack(0) +#else + #pragma pack(pop) +#endif + +enum { + ST_DATA = 0, // Data packet. + ST_FIN = 1, // Finalize the connection. This is the last packet. + ST_STATE = 2, // State packet. Used to transmit an ACK with no data. + ST_RESET = 3, // Terminate connection forcefully. + ST_SYN = 4, // Connect SYN + ST_NUM_STATES, // used for bounds checking +}; + +static const cstr flagnames[] = { + "ST_DATA","ST_FIN","ST_STATE","ST_RESET","ST_SYN" +}; + +enum CONN_STATE { + CS_UNINITIALIZED = 0, + CS_IDLE, + CS_SYN_SENT, + CS_SYN_RECV, + CS_CONNECTED, + CS_CONNECTED_FULL, + CS_RESET, + CS_DESTROY +}; + +static const cstr statenames[] = { + "UNINITIALIZED", "IDLE","SYN_SENT", "SYN_RECV", "CONNECTED","CONNECTED_FULL","DESTROY_DELAY","RESET","DESTROY" +}; + +struct OutgoingPacket { + size_t length; + size_t payload; + uint64 time_sent; // microseconds + uint transmissions:31; + bool need_resend:1; + byte data[1]; +}; + +struct SizableCircularBuffer { + // This is the mask. Since it's always a power of 2, adding 1 to this value will return the size. + size_t mask; + // This is the elements that the circular buffer points to + void **elements; + + void *get(size_t i) const { assert(elements); return elements ? elements[i & mask] : NULL; } + void put(size_t i, void *data) { assert(elements); elements[i&mask] = data; } + + void grow(size_t item, size_t index); + void ensure_size(size_t item, size_t index) { if (index > mask) grow(item, index); } + size_t size() { return mask + 1; } +}; + +// Item contains the element we want to make space for +// index is the index in the list. 
+void SizableCircularBuffer::grow(size_t item, size_t index) +{ + // Figure out the new size. + size_t size = mask + 1; + do size *= 2; while (index >= size); + + // Allocate the new buffer + void **buf = (void**)calloc(size, sizeof(void*)); + + size--; + + // Copy elements from the old buffer to the new buffer + for (size_t i = 0; i <= mask; i++) { + buf[(item - index + i) & size] = get(item - index + i); + } + + // Swap to the newly allocated buffer + mask = size; + free(elements); + elements = buf; +} + +// compare if lhs is less than rhs, taking wrapping +// into account. if lhs is close to UINT_MAX and rhs +// is close to 0, lhs is assumed to have wrapped and +// considered smaller +bool wrapping_compare_less(uint32 lhs, uint32 rhs, uint32 mask) +{ + // distance walking from lhs to rhs, downwards + const uint32 dist_down = (lhs - rhs) & mask; + // distance walking from lhs to rhs, upwards + const uint32 dist_up = (rhs - lhs) & mask; + + // if the distance walking up is shorter, lhs + // is less than rhs. If the distance walking down + // is shorter, then rhs is less than lhs + return dist_up < dist_down; +} + +struct DelayHist { + uint32 delay_base; + + // this is the history of delay samples, + // normalized by using the delay_base. These + // values are always greater than 0 and measures + // the queuing delay in microseconds + uint32 cur_delay_hist[CUR_DELAY_SIZE]; + size_t cur_delay_idx; + + // this is the history of delay_base. It's + // a number that doesn't have an absolute meaning + // only relative. It doesn't make sense to initialize + // it to anything other than values relative to + // what's been seen in the real world. 
+ uint32 delay_base_hist[DELAY_BASE_HISTORY]; + size_t delay_base_idx; + // the time when we last stepped the delay_base_idx + uint64 delay_base_time; + + bool delay_base_initialized; + + void clear(uint64 current_ms) + { + delay_base_initialized = false; + delay_base = 0; + cur_delay_idx = 0; + delay_base_idx = 0; + delay_base_time = current_ms; + for (size_t i = 0; i < CUR_DELAY_SIZE; i++) { + cur_delay_hist[i] = 0; + } + for (size_t i = 0; i < DELAY_BASE_HISTORY; i++) { + delay_base_hist[i] = 0; + } + } + + void shift(const uint32 offset) + { + // the offset should never be "negative" + // assert(offset < 0x10000000); + + // increase all of our base delays by this amount + // this is used to take clock skew into account + // by observing the other side's changes in its base_delay + for (size_t i = 0; i < DELAY_BASE_HISTORY; i++) { + delay_base_hist[i] += offset; + } + delay_base += offset; + } + + void add_sample(const uint32 sample, uint64 current_ms) + { + // The two clocks (in the two peers) are assumed not to + // progress at the exact same rate. They are assumed to be + // drifting, which causes the delay samples to contain + // a systematic error, either they are under- + // estimated or over-estimated. This is why we update the + // delay_base every two minutes, to adjust for this. + + // This means the values will keep drifting and eventually wrap. + // We can cross the wrapping boundry in two directions, either + // going up, crossing the highest value, or going down, crossing 0. 
+ + // if the delay_base is close to the max value and sample actually + // wrapped on the other end we would see something like this: + // delay_base = 0xffffff00, sample = 0x00000400 + // sample - delay_base = 0x500 which is the correct difference + + // if the delay_base is instead close to 0, and we got an even lower + // sample (that will eventually update the delay_base), we may see + // something like this: + // delay_base = 0x00000400, sample = 0xffffff00 + // sample - delay_base = 0xfffffb00 + // this needs to be interpreted as a negative number and the actual + // recorded delay should be 0. + + // It is important that all arithmetic that assume wrapping + // is done with unsigned intergers. Signed integers are not guaranteed + // to wrap the way unsigned integers do. At least GCC takes advantage + // of this relaxed rule and won't necessarily wrap signed ints. + + // remove the clock offset and propagation delay. + // delay base is min of the sample and the current + // delay base. This min-operation is subject to wrapping + // and care needs to be taken to correctly choose the + // true minimum. + + // specifically the problem case is when delay_base is very small + // and sample is very large (because it wrapped past zero), sample + // needs to be considered the smaller + + if (!delay_base_initialized) { + // delay_base being 0 suggests that we haven't initialized + // it or its history with any real measurements yet. Initialize + // everything with this sample. + for (size_t i = 0; i < DELAY_BASE_HISTORY; i++) { + // if we don't have a value, set it to the current sample + delay_base_hist[i] = sample; + continue; + } + delay_base = sample; + delay_base_initialized = true; + } + + if (wrapping_compare_less(sample, delay_base_hist[delay_base_idx], TIMESTAMP_MASK)) { + // sample is smaller than the current delay_base_hist entry + // update it + delay_base_hist[delay_base_idx] = sample; + } + + // is sample lower than delay_base? 
If so, update delay_base + if (wrapping_compare_less(sample, delay_base, TIMESTAMP_MASK)) { + // sample is smaller than the current delay_base + // update it + delay_base = sample; + } + + // this operation may wrap, and is supposed to + const uint32 delay = sample - delay_base; + // sanity check. If this is triggered, something fishy is going on + // it means the measured sample was greater than 32 seconds! + //assert(delay < 0x2000000); + + cur_delay_hist[cur_delay_idx] = delay; + cur_delay_idx = (cur_delay_idx + 1) % CUR_DELAY_SIZE; + + // once every minute + if (current_ms - delay_base_time > 60 * 1000) { + delay_base_time = current_ms; + delay_base_idx = (delay_base_idx + 1) % DELAY_BASE_HISTORY; + // clear up the new delay base history spot by initializing + // it to the current sample, then update it + delay_base_hist[delay_base_idx] = sample; + delay_base = delay_base_hist[0]; + // Assign the lowest delay in the last 2 minutes to delay_base + for (size_t i = 0; i < DELAY_BASE_HISTORY; i++) { + if (wrapping_compare_less(delay_base_hist[i], delay_base, TIMESTAMP_MASK)) + delay_base = delay_base_hist[i]; + } + } + } + + uint32 get_value() + { + uint32 value = UINT_MAX; + for (size_t i = 0; i < CUR_DELAY_SIZE; i++) { + value = min(cur_delay_hist[i], value); + } + // value could be UINT_MAX if we have no samples yet... + return value; + } +}; + +struct UTPSocket { + ~UTPSocket(); + + PackedSockAddr addr; + utp_context *ctx; + + int ida; //for ack socket list + + uint16 retransmit_count; + + uint16 reorder_count; + byte duplicate_ack; + + // the number of packets in the send queue. 
Packets that haven't + // yet been sent count as well as packets marked as needing resend + // the oldest un-acked packet in the send queue is seq_nr - cur_window_packets + uint16 cur_window_packets; + + // how much of the window is used, number of bytes in-flight + // packets that have not yet been sent do not count, packets + // that are marked as needing to be re-sent (due to a timeout) + // don't count either + size_t cur_window; + // maximum window size, in bytes + size_t max_window; + // UTP_SNDBUF setting, in bytes + size_t opt_sndbuf; + // UTP_RCVBUF setting, in bytes + size_t opt_rcvbuf; + + // this is the target delay, in microseconds + // for this socket. defaults to 100000. + size_t target_delay; + + // Is a FIN packet in the reassembly buffer? + bool got_fin:1; + // Have we reached the FIN? + bool got_fin_reached:1; + + // Have we sent our FIN? + bool fin_sent:1; + // Has our fin been ACKed? + bool fin_sent_acked:1; + + // Reading is disabled + bool read_shutdown:1; + // User called utp_close() + bool close_requested:1; + + // Timeout procedure + bool fast_timeout:1; + + // max receive window for other end, in bytes + size_t max_window_user; + CONN_STATE state; + // TickCount when we last decayed window (wraps) + int64 last_rwin_decay; + + // the sequence number of the FIN packet. This field is only set + // when we have received a FIN, and the flag field has the FIN flag set. + // it is used to know when it is safe to destroy the socket, we must have + // received all packets up to this sequence number first. + uint16 eof_pkt; + + // All sequence numbers up to including this have been properly received + // by us + uint16 ack_nr; + // This is the sequence number for the next packet to be sent. + uint16 seq_nr; + + uint16 timeout_seq_nr; + + // This is the sequence number of the next packet we're allowed to + // do a fast resend with. This makes sure we only do a fast-resend + // once per packet. 
We can resend the packet with this sequence number + // or any later packet (with a higher sequence number). + uint16 fast_resend_seq_nr; + + uint32 reply_micro; + + uint64 last_got_packet; + uint64 last_sent_packet; + uint64 last_measured_delay; + + // timestamp of the last time the cwnd was full + // this is used to prevent the congestion window + // from growing when we're not sending at capacity + mutable uint64 last_maxed_out_window; + + void *userdata; + + // Round trip time + uint rtt; + // Round trip time variance + uint rtt_var; + // Round trip timeout + uint rto; + DelayHist rtt_hist; + uint retransmit_timeout; + // The RTO timer will timeout here. + uint64 rto_timeout; + // When the window size is set to zero, start this timer. It will send a new packet every 30secs. + uint64 zerowindow_time; + + uint32 conn_seed; + // Connection ID for packets I receive + uint32 conn_id_recv; + // Connection ID for packets I send + uint32 conn_id_send; + // Last rcv window we advertised, in bytes + size_t last_rcv_win; + + DelayHist our_hist; + DelayHist their_hist; + + // extension bytes from SYN packet + byte extensions[8]; + + // MTU Discovery + // time when we should restart the MTU discovery + uint64 mtu_discover_time; + // ceiling and floor of binary search. last is the mtu size + // we're currently using + uint32 mtu_ceiling, mtu_floor, mtu_last; + // we only ever have a single probe in flight at any given time. + // this is the sequence number of that probe, and the size of + // that packet + uint32 mtu_probe_seq, mtu_probe_size; + + // this is the average delay samples, as compared to the initial + // sample. It's averaged over 5 seconds + int32 average_delay; + // this is the sum of all the delay samples + // we've made recently. The important distinction + // of these samples is that they are all made compared + // to the initial sample, this is to deal with + // wrapping in a simple way. 
+ int64 current_delay_sum; + // number of sample ins current_delay_sum + int current_delay_samples; + // initialized to 0, set to the first raw delay sample + // each sample that's added to current_delay_sum + // is subtracted from the value first, to make it + // a delay relative to this sample + uint32 average_delay_base; + // the next time we should add an average delay + // sample into average_delay_hist + uint64 average_sample_time; + // the estimated clock drift between our computer + // and the endpoint computer. The unit is microseconds + // per 5 seconds + int32 clock_drift; + // just used for logging + int32 clock_drift_raw; + + SizableCircularBuffer inbuf, outbuf; + + #ifdef _DEBUG + // Public per-socket statistics, returned by utp_get_stats() + utp_socket_stats _stats; + #endif + + // true if we're in slow-start (exponential growth) phase + bool slow_start; + + // the slow-start threshold, in bytes + size_t ssthresh; + + void log(int level, char const *fmt, ...) + { + va_list va; + char buf[4096], buf2[4096]; + + // don't bother with vsnprintf() etc calls if we're not going to log. + if (!ctx->would_log(level)) { + return; + } + + va_start(va, fmt); + vsnprintf(buf, 4096, fmt, va); + va_end(va); + buf[4095] = '\0'; + + snprintf(buf2, 4096, "%p %s %06u %s", this, addrfmt(addr, addrbuf), conn_id_recv, buf); + buf2[4095] = '\0'; + + ctx->log_unchecked(this, buf2); + } + + void schedule_ack(); + + // called every time mtu_floor or mtu_ceiling are adjusted + void mtu_search_update(); + void mtu_reset(); + + // Calculates the current receive window + size_t get_rcv_window() + { + // Trim window down according to what's already in buffer. + const size_t numbuf = utp_call_get_read_buffer_size(this->ctx, this); + assert((int)numbuf >= 0); + return opt_rcvbuf > numbuf ? 
opt_rcvbuf - numbuf : 0; + } + + // Test if we're ready to decay max_window + // XXX this breaks when spaced by > INT_MAX/2, which is 49 + // days; the failure mode in that case is we do an extra decay + // or fail to do one when we really shouldn't. + bool can_decay_win(int64 msec) const + { + return (msec - last_rwin_decay) >= MAX_WINDOW_DECAY; + } + + // If we can, decay max window, returns true if we actually did so + void maybe_decay_win(uint64 current_ms) + { + if (can_decay_win(current_ms)) { + // TCP uses 0.5 + max_window = (size_t)(max_window * .5); + last_rwin_decay = current_ms; + if (max_window < MIN_WINDOW_SIZE) + max_window = MIN_WINDOW_SIZE; + slow_start = false; + ssthresh = max_window; + } + } + + size_t get_header_size() const + { + return sizeof(PacketFormatV1); + } + + size_t get_udp_mtu() + { + socklen_t len; + SOCKADDR_STORAGE sa = addr.get_sockaddr_storage(&len); + return utp_call_get_udp_mtu(this->ctx, this, (const struct sockaddr *)&sa, len); + } + + size_t get_udp_overhead() + { + socklen_t len; + SOCKADDR_STORAGE sa = addr.get_sockaddr_storage(&len); + return utp_call_get_udp_overhead(this->ctx, this, (const struct sockaddr *)&sa, len); + } + + size_t get_overhead() + { + return get_udp_overhead() + get_header_size(); + } + + void send_data(byte* b, size_t length, bandwidth_type_t type, uint32 flags = 0); + + void send_ack(bool synack = false); + + void send_keep_alive(); + + static void send_rst(utp_context *ctx, + const PackedSockAddr &addr, uint32 conn_id_send, + uint16 ack_nr, uint16 seq_nr); + + void send_packet(OutgoingPacket *pkt); + + bool is_full(int bytes = -1); + bool flush_packets(); + void write_outgoing_packet(size_t payload, uint flags, struct utp_iovec *iovec, size_t num_iovecs); + + #ifdef _DEBUG + void check_invariant(); + #endif + + void check_timeouts(); + int ack_packet(uint16 seq); + size_t selective_ack_bytes(uint base, const byte* mask, byte len, int64& min_rtt); + void selective_ack(uint base, const byte *mask, 
byte len); + void apply_ccontrol(size_t bytes_acked, uint32 actual_delay, int64 min_rtt); + size_t get_packet_size() const; +}; + +void removeSocketFromAckList(UTPSocket *conn) +{ + if (conn->ida >= 0) + { + UTPSocket *last = conn->ctx->ack_sockets[conn->ctx->ack_sockets.GetCount() - 1]; + + assert(last->ida < (int)(conn->ctx->ack_sockets.GetCount())); + assert(conn->ctx->ack_sockets[last->ida] == last); + last->ida = conn->ida; + conn->ctx->ack_sockets[conn->ida] = last; + conn->ida = -1; + + // Decrease the count + conn->ctx->ack_sockets.SetCount(conn->ctx->ack_sockets.GetCount() - 1); + } +} + +static void utp_register_sent_packet(utp_context *ctx, size_t length) +{ + if (length <= PACKET_SIZE_MID) { + if (length <= PACKET_SIZE_EMPTY) { + ctx->context_stats._nraw_send[PACKET_SIZE_EMPTY_BUCKET]++; + } else if (length <= PACKET_SIZE_SMALL) { + ctx->context_stats._nraw_send[PACKET_SIZE_SMALL_BUCKET]++; + } else + ctx->context_stats._nraw_send[PACKET_SIZE_MID_BUCKET]++; + } else { + if (length <= PACKET_SIZE_BIG) { + ctx->context_stats._nraw_send[PACKET_SIZE_BIG_BUCKET]++; + } else + ctx->context_stats._nraw_send[PACKET_SIZE_HUGE_BUCKET]++; + } +} + +void send_to_addr(utp_context *ctx, const byte *p, size_t len, const PackedSockAddr &addr, int flags = 0) +{ + socklen_t tolen; + SOCKADDR_STORAGE to = addr.get_sockaddr_storage(&tolen); + utp_register_sent_packet(ctx, len); + utp_call_sendto(ctx, NULL, p, len, (const struct sockaddr *)&to, tolen, flags); +} + +void UTPSocket::schedule_ack() +{ + if (ida == -1){ + #if UTP_DEBUG_LOGGING + log(UTP_LOG_DEBUG, "schedule_ack"); + #endif + ida = ctx->ack_sockets.Append(this); + } else { + #if UTP_DEBUG_LOGGING + log(UTP_LOG_DEBUG, "schedule_ack: already in list"); + #endif + } +} + +void UTPSocket::send_data(byte* b, size_t length, bandwidth_type_t type, uint32 flags) +{ + // time stamp this packet with local time, the stamp goes into + // the header of every packet at the 8th byte for 8 bytes : + // two integers, check 
packet.h for more + uint64 time = utp_call_get_microseconds(ctx, this); + + PacketFormatV1* b1 = (PacketFormatV1*)b; + b1->tv_usec = (uint32)time; + b1->reply_micro = reply_micro; + + last_sent_packet = ctx->current_ms; + + #ifdef _DEBUG + _stats.nbytes_xmit += length; + ++_stats.nxmit; + #endif + + if (ctx->callbacks[UTP_ON_OVERHEAD_STATISTICS]) { + size_t n; + if (type == payload_bandwidth) { + // if this packet carries payload, just + // count the header as overhead + type = header_overhead; + n = get_overhead(); + } else { + n = length + get_udp_overhead(); + } + utp_call_on_overhead_statistics(ctx, this, true, n, type); + } +#if UTP_DEBUG_LOGGING + int flags2 = b1->type(); + uint16 seq_nr = b1->seq_nr; + uint16 ack_nr = b1->ack_nr; + log(UTP_LOG_DEBUG, "send %s len:%u id:%u timestamp:" I64u " reply_micro:%u flags:%s seq_nr:%u ack_nr:%u", + addrfmt(addr, addrbuf), (uint)length, conn_id_send, time, reply_micro, flagnames[flags2], + seq_nr, ack_nr); +#endif + send_to_addr(ctx, b, length, addr, flags); + removeSocketFromAckList(this); +} + +void UTPSocket::send_ack(bool synack) +{ + PacketFormatAckV1 pfa; + zeromem(&pfa); + + size_t len; + last_rcv_win = get_rcv_window(); + pfa.pf.set_version(1); + pfa.pf.set_type(ST_STATE); + pfa.pf.ext = 0; + pfa.pf.connid = conn_id_send; + pfa.pf.ack_nr = ack_nr; + pfa.pf.seq_nr = seq_nr; + pfa.pf.windowsize = (uint32)last_rcv_win; + len = sizeof(PacketFormatV1); + + // we never need to send EACK for connections + // that are shutting down + if (reorder_count != 0 && !got_fin_reached) { + // if reorder count > 0, send an EACK. 
+ // reorder count should always be 0 + // for synacks, so this should not be + // as synack + assert(!synack); + pfa.pf.ext = 1; + pfa.ext_next = 0; + pfa.ext_len = 4; + uint m = 0; + + // reorder count should only be non-zero + // if the packet ack_nr + 1 has not yet + // been received + assert(inbuf.get(ack_nr + 1) == NULL); + size_t window = min(14+16, inbuf.size()); + // Generate bit mask of segments received. + for (size_t i = 0; i < window; i++) { + if (inbuf.get(ack_nr + i + 2) != NULL) { + m |= 1 << i; + + #if UTP_DEBUG_LOGGING + log(UTP_LOG_DEBUG, "EACK packet [%u]", ack_nr + i + 2); + #endif + } + } + pfa.acks[0] = (byte)m; + pfa.acks[1] = (byte)(m >> 8); + pfa.acks[2] = (byte)(m >> 16); + pfa.acks[3] = (byte)(m >> 24); + len += 4 + 2; + + #if UTP_DEBUG_LOGGING + log(UTP_LOG_DEBUG, "Sending EACK %u [%u] bits:[%032b]", ack_nr, conn_id_send, m); + #endif + } else { + #if UTP_DEBUG_LOGGING + log(UTP_LOG_DEBUG, "Sending ACK %u [%u]", ack_nr, conn_id_send); + #endif + } + + send_data((byte*)&pfa, len, ack_overhead); + removeSocketFromAckList(this); +} + +void UTPSocket::send_keep_alive() +{ + ack_nr--; + + #if UTP_DEBUG_LOGGING + log(UTP_LOG_DEBUG, "Sending KeepAlive ACK %u [%u]", ack_nr, conn_id_send); + #endif + + send_ack(); + ack_nr++; +} + +void UTPSocket::send_rst(utp_context *ctx, + const PackedSockAddr &addr, uint32 conn_id_send, uint16 ack_nr, uint16 seq_nr) +{ + PacketFormatV1 pf1; + zeromem(&pf1); + + size_t len; + pf1.set_version(1); + pf1.set_type(ST_RESET); + pf1.ext = 0; + pf1.connid = conn_id_send; + pf1.ack_nr = ack_nr; + pf1.seq_nr = seq_nr; + pf1.windowsize = 0; + len = sizeof(PacketFormatV1); + +// LOG_DEBUG("%s: Sending RST id:%u seq_nr:%u ack_nr:%u", addrfmt(addr, addrbuf), conn_id_send, seq_nr, ack_nr); +// LOG_DEBUG("send %s len:%u id:%u", addrfmt(addr, addrbuf), (uint)len, conn_id_send); + send_to_addr(ctx, (const byte*)&pf1, len, addr); +} + +void UTPSocket::send_packet(OutgoingPacket *pkt) +{ + // only count against the quota the 
first time we + // send the packet. Don't enforce quota when closing + // a socket. Only enforce the quota when we're sending + // at slow rates (max window < packet size) + + //size_t max_send = min(max_window, opt_sndbuf, max_window_user); + time_t cur_time = utp_call_get_milliseconds(this->ctx, this); + + if (pkt->transmissions == 0 || pkt->need_resend) { + cur_window += pkt->payload; + } + + pkt->need_resend = false; + + PacketFormatV1* p1 = (PacketFormatV1*)pkt->data; + p1->ack_nr = ack_nr; + pkt->time_sent = utp_call_get_microseconds(this->ctx, this); + + //socklen_t salen; + //SOCKADDR_STORAGE sa = addr.get_sockaddr_storage(&salen); + bool use_as_mtu_probe = false; + + // TODO: this is subject to nasty wrapping issues! Below as well + if (mtu_discover_time < (uint64)cur_time) { + // it's time to reset our MTU assupmtions + // and trigger a new search + mtu_reset(); + } + + // don't use packets that are larger then mtu_ceiling + // as probes, since they were probably used as probes + // already and failed, now we need it to fragment + // just to get it through + // if seq_nr == 1, the probe would end up being 0 + // which is a magic number representing no-probe + // that why we don't send a probe for a packet with + // sequence number 0 + if (mtu_floor < mtu_ceiling + && pkt->length > mtu_floor + && pkt->length <= mtu_ceiling + && mtu_probe_seq == 0 + && seq_nr != 1 + && pkt->transmissions == 0) { + + // we've already incremented seq_nr + // for this packet + mtu_probe_seq = (seq_nr - 1) & ACK_NR_MASK; + mtu_probe_size = pkt->length; + assert(pkt->length >= mtu_floor); + assert(pkt->length <= mtu_ceiling); + use_as_mtu_probe = true; + log(UTP_LOG_MTU, "MTU [PROBE] floor:%d ceiling:%d current:%d" + , mtu_floor, mtu_ceiling, mtu_probe_size); + } + + pkt->transmissions++; + send_data((byte*)pkt->data, pkt->length, + (state == CS_SYN_SENT) ? connect_overhead + : (pkt->transmissions == 1) ? payload_bandwidth + : retransmit_overhead, use_as_mtu_probe ? 
UTP_UDP_DONTFRAG : 0); +} + +bool UTPSocket::is_full(int bytes) +{ + size_t packet_size = get_packet_size(); + if (bytes < 0) bytes = packet_size; + else if (bytes > (int)packet_size) bytes = (int)packet_size; + size_t max_send = min(max_window, opt_sndbuf, max_window_user); + + // subtract one to save space for the FIN packet + if (cur_window_packets >= OUTGOING_BUFFER_MAX_SIZE - 1) { + + #if UTP_DEBUG_LOGGING + log(UTP_LOG_DEBUG, "is_full:false cur_window_packets:%d MAX:%d", cur_window_packets, OUTGOING_BUFFER_MAX_SIZE - 1); + #endif + + last_maxed_out_window = ctx->current_ms; + return true; + } + + #if UTP_DEBUG_LOGGING + log(UTP_LOG_DEBUG, "is_full:%s. cur_window:%u pkt:%u max:%u cur_window_packets:%u max_window:%u" + , (cur_window + bytes > max_send) ? "true" : "false" + , cur_window, bytes, max_send, cur_window_packets + , max_window); + #endif + + if (cur_window + bytes > max_send) { + last_maxed_out_window = ctx->current_ms; + return true; + } + return false; +} + +bool UTPSocket::flush_packets() +{ + size_t packet_size = get_packet_size(); + + // send packets that are waiting on the pacer to be sent + // i has to be an unsigned 16 bit counter to wrap correctly + // signed types are not guaranteed to wrap the way you expect + for (uint16 i = seq_nr - cur_window_packets; i != seq_nr; ++i) { + OutgoingPacket *pkt = (OutgoingPacket*)outbuf.get(i); + if (pkt == 0 || (pkt->transmissions > 0 && pkt->need_resend == false)) continue; + // have we run out of quota? + if (is_full()) return true; + + // Nagle check + // don't send the last packet if we have one packet in-flight + // and the current packet is still smaller than packet_size. 
+ if (i != ((seq_nr - 1) & ACK_NR_MASK) || + cur_window_packets == 1 || + pkt->payload >= packet_size) { + send_packet(pkt); + } + } + return false; +} + +// @payload: number of bytes to send +// @flags: either ST_DATA, or ST_FIN +// @iovec: base address of iovec array +// @num_iovecs: number of iovecs in array +void UTPSocket::write_outgoing_packet(size_t payload, uint flags, struct utp_iovec *iovec, size_t num_iovecs) +{ + // Setup initial timeout timer + if (cur_window_packets == 0) { + retransmit_timeout = rto; + rto_timeout = ctx->current_ms + retransmit_timeout; + assert(cur_window == 0); + } + + size_t packet_size = get_packet_size(); + do { + assert(cur_window_packets < OUTGOING_BUFFER_MAX_SIZE); + assert(flags == ST_DATA || flags == ST_FIN); + + size_t added = 0; + + OutgoingPacket *pkt = NULL; + + if (cur_window_packets > 0) { + pkt = (OutgoingPacket*)outbuf.get(seq_nr - 1); + } + + const size_t header_size = get_header_size(); + bool append = true; + + // if there's any room left in the last packet in the window + // and it hasn't been sent yet, fill that frame first + if (payload && pkt && !pkt->transmissions && pkt->payload < packet_size) { + // Use the previous unsent packet + added = min(payload + pkt->payload, max(packet_size, pkt->payload)) - pkt->payload; + pkt = (OutgoingPacket*)realloc(pkt, + (sizeof(OutgoingPacket) - 1) + + header_size + + pkt->payload + added); + outbuf.put(seq_nr - 1, pkt); + append = false; + assert(!pkt->need_resend); + } else { + // Create the packet to send. + added = payload; + pkt = (OutgoingPacket*)malloc((sizeof(OutgoingPacket) - 1) + + header_size + + added); + pkt->payload = 0; + pkt->transmissions = 0; + pkt->need_resend = false; + } + + if (added) { + assert(flags == ST_DATA); + + // Fill it with data from the upper layer. 
+ unsigned char *p = pkt->data + header_size + pkt->payload; + size_t needed = added; + + /* + while (needed) { + *p = *(char*)iovec[0].iov_base; + p++; + iovec[0].iov_base = (char *)iovec[0].iov_base + 1; + needed--; + } + */ + + for (size_t i = 0; i < num_iovecs && needed; i++) { + if (iovec[i].iov_len == 0) + continue; + + size_t num = min(needed, iovec[i].iov_len); + memcpy(p, iovec[i].iov_base, num); + + p += num; + + iovec[i].iov_len -= num; + iovec[i].iov_base = (byte*)iovec[i].iov_base + num; // iovec[i].iov_base += num, but without void* pointers + needed -= num; + } + + assert(needed == 0); + } + pkt->payload += added; + pkt->length = header_size + pkt->payload; + + last_rcv_win = get_rcv_window(); + + PacketFormatV1* p1 = (PacketFormatV1*)pkt->data; + p1->set_version(1); + p1->set_type(flags); + p1->ext = 0; + p1->connid = conn_id_send; + p1->windowsize = (uint32)last_rcv_win; + p1->ack_nr = ack_nr; + + if (append) { + // Remember the message in the outgoing queue. + outbuf.ensure_size(seq_nr, cur_window_packets); + outbuf.put(seq_nr, pkt); + p1->seq_nr = seq_nr; + seq_nr++; + cur_window_packets++; + } + + payload -= added; + + } while (payload); + + flush_packets(); +} + +#ifdef _DEBUG +void UTPSocket::check_invariant() +{ + if (reorder_count > 0) { + assert(inbuf.get(ack_nr + 1) == NULL); + } + + size_t outstanding_bytes = 0; + for (int i = 0; i < cur_window_packets; ++i) { + OutgoingPacket *pkt = (OutgoingPacket*)outbuf.get(seq_nr - i - 1); + if (pkt == 0 || pkt->transmissions == 0 || pkt->need_resend) continue; + outstanding_bytes += pkt->payload; + } + assert(outstanding_bytes == cur_window); +} +#endif + +void UTPSocket::check_timeouts() +{ + #ifdef _DEBUG + check_invariant(); + #endif + + // this invariant should always be true + assert(cur_window_packets == 0 || outbuf.get(seq_nr - cur_window_packets)); + + #if UTP_DEBUG_LOGGING + log(UTP_LOG_DEBUG, "CheckTimeouts timeout:%d max_window:%u cur_window:%u " + "state:%s cur_window_packets:%u", + 
(int)(rto_timeout - ctx->current_ms), (uint)max_window, (uint)cur_window, + statenames[state], cur_window_packets); + #endif + + if (state != CS_DESTROY) flush_packets(); + + switch (state) { + case CS_SYN_SENT: + case CS_SYN_RECV: + case CS_CONNECTED_FULL: + case CS_CONNECTED: { + + // Reset max window... + if ((int)(ctx->current_ms - zerowindow_time) >= 0 && max_window_user == 0) { + max_window_user = PACKET_SIZE; + } + + if ((int)(ctx->current_ms - rto_timeout) >= 0 + && rto_timeout > 0) { + + bool ignore_loss = false; + + if (cur_window_packets == 1 + && ((seq_nr - 1) & ACK_NR_MASK) == mtu_probe_seq + && mtu_probe_seq != 0) { + // we only had a single outstanding packet that timed out, and it was the probe + mtu_ceiling = mtu_probe_size - 1; + mtu_search_update(); + // this packet was most likely dropped because the packet size being + // too big and not because congestion. To accelerate the binary search for + // the MTU, resend immediately and don't reset the window size + ignore_loss = true; + log(UTP_LOG_MTU, "MTU [PROBE-TIMEOUT] floor:%d ceiling:%d current:%d" + , mtu_floor, mtu_ceiling, mtu_last); + } + // we dropepd the probe, clear these fields to + // allow us to send a new one + mtu_probe_seq = mtu_probe_size = 0; + log(UTP_LOG_MTU, "MTU [TIMEOUT]"); + + /* + OutgoingPacket *pkt = (OutgoingPacket*)outbuf.get(seq_nr - cur_window_packets); + + // If there were a lot of retransmissions, force recomputation of round trip time + if (pkt->transmissions >= 4) + rtt = 0; + */ + + // Increase RTO + const uint new_timeout = ignore_loss ? retransmit_timeout : retransmit_timeout * 2; + + // They initiated the connection but failed to respond before the rto. + // A malicious client can also spoof the destination address of a ST_SYN bringing us to this state. 
+ // Kill the connection and do not notify the upper layer + if (state == CS_SYN_RECV) { + state = CS_DESTROY; + utp_call_on_error(ctx, this, UTP_ETIMEDOUT); + return; + } + + // We initiated the connection but the other side failed to respond before the rto + if (retransmit_count >= 4 || (state == CS_SYN_SENT && retransmit_count >= 2)) { + // 4 consecutive transmissions have timed out. Kill it. If we + // haven't even connected yet, give up after only 2 consecutive + // failed transmissions. + if (close_requested) + state = CS_DESTROY; + else + state = CS_RESET; + utp_call_on_error(ctx, this, UTP_ETIMEDOUT); + return; + } + + retransmit_timeout = new_timeout; + rto_timeout = ctx->current_ms + new_timeout; + + if (!ignore_loss) { + // On Timeout + duplicate_ack = 0; + + int packet_size = get_packet_size(); + + if ((cur_window_packets == 0) && ((int)max_window > packet_size)) { + // we don't have any packets in-flight, even though + // we could. This implies that the connection is just + // idling. No need to be aggressive about resetting the + // congestion window. Just let it decay by a 3:rd. + // don't set it any lower than the packet size though + max_window = max(max_window * 2 / 3, size_t(packet_size)); + } else { + // our delay was so high that our congestion window + // was shrunk below one packet, preventing us from + // sending anything for one time-out period. Now, reset + // the congestion window to fit one packet, to start over + // again + max_window = packet_size; + slow_start = true; + } + } + + // every packet should be considered lost + for (int i = 0; i < cur_window_packets; ++i) { + OutgoingPacket *pkt = (OutgoingPacket*)outbuf.get(seq_nr - i - 1); + if (pkt == 0 || pkt->transmissions == 0 || pkt->need_resend) continue; + pkt->need_resend = true; + assert(cur_window >= pkt->payload); + cur_window -= pkt->payload; + } + + if (cur_window_packets > 0) { + retransmit_count++; + // used in parse_log.py + log(UTP_LOG_NORMAL, "Packet timeout. Resend. 
seq_nr:%u. timeout:%u " + "max_window:%u cur_window_packets:%d" + , seq_nr - cur_window_packets, retransmit_timeout + , (uint)max_window, int(cur_window_packets)); + + fast_timeout = true; + timeout_seq_nr = seq_nr; + + OutgoingPacket *pkt = (OutgoingPacket*)outbuf.get(seq_nr - cur_window_packets); + assert(pkt); + + // Re-send the packet. + send_packet(pkt); + } + } + + // Mark the socket as writable. If the cwnd has grown, or if the number of + // bytes in-flight is lower than cwnd, we need to make the socket writable again + // in case it isn't + if (state == CS_CONNECTED_FULL && !is_full()) { + state = CS_CONNECTED; + + #if UTP_DEBUG_LOGGING + log(UTP_LOG_DEBUG, "Socket writable. max_window:%u cur_window:%u packet_size:%u", + (uint)max_window, (uint)cur_window, (uint)get_packet_size()); + #endif + utp_call_on_state_change(this->ctx, this, UTP_STATE_WRITABLE); + } + + if (state >= CS_CONNECTED && !fin_sent) { + if ((int)(ctx->current_ms - last_sent_packet) >= KEEPALIVE_INTERVAL) { + send_keep_alive(); + } + } + break; + } + + // prevent warning + case CS_UNINITIALIZED: + case CS_IDLE: + case CS_RESET: + case CS_DESTROY: + break; + } +} + +// this should be called every time we change mtu_floor or mtu_ceiling +void UTPSocket::mtu_search_update() +{ + assert(mtu_floor <= mtu_ceiling); + + // binary search + mtu_last = (mtu_floor + mtu_ceiling) / 2; + + // enable a new probe to be sent + mtu_probe_seq = mtu_probe_size = 0; + + // if the floor and ceiling are close enough, consider the + // MTU binary search complete. 
We set the current value + // to floor since that's the only size we know can go through + // also set the ceiling to floor to terminate the searching + if (mtu_ceiling - mtu_floor <= 16) { + mtu_last = mtu_floor; + log(UTP_LOG_MTU, "MTU [DONE] floor:%d ceiling:%d current:%d" + , mtu_floor, mtu_ceiling, mtu_last); + mtu_ceiling = mtu_floor; + assert(mtu_floor <= mtu_ceiling); + // Do another search in 30 minutes + mtu_discover_time = utp_call_get_milliseconds(this->ctx, this) + 30 * 60 * 1000; + } +} + +void UTPSocket::mtu_reset() +{ + mtu_ceiling = get_udp_mtu(); + // Less would not pass TCP... + mtu_floor = 576; + log(UTP_LOG_MTU, "MTU [RESET] floor:%d ceiling:%d current:%d" + , mtu_floor, mtu_ceiling, mtu_last); + assert(mtu_floor <= mtu_ceiling); + mtu_discover_time = utp_call_get_milliseconds(this->ctx, this) + 30 * 60 * 1000; +} + +// returns: +// 0: the packet was acked. +// 1: it means that the packet had already been acked +// 2: the packet has not been sent yet +int UTPSocket::ack_packet(uint16 seq) +{ + OutgoingPacket *pkt = (OutgoingPacket*)outbuf.get(seq); + + // the packet has already been acked (or not sent) + if (pkt == NULL) { + + #if UTP_DEBUG_LOGGING + log(UTP_LOG_DEBUG, "got ack for:%u (already acked, or never sent)", seq); + #endif + + return 1; + } + + // can't ack packets that haven't been sent yet! + if (pkt->transmissions == 0) { + + #if UTP_DEBUG_LOGGING + log(UTP_LOG_DEBUG, "got ack for:%u (never sent, pkt_size:%u need_resend:%u)", + seq, (uint)pkt->payload, pkt->need_resend); + #endif + + return 2; + } + + #if UTP_DEBUG_LOGGING + log(UTP_LOG_DEBUG, "got ack for:%u (pkt_size:%u need_resend:%u)", + seq, (uint)pkt->payload, pkt->need_resend); + #endif + + outbuf.put(seq, NULL); + + // if we never re-sent the packet, update the RTT estimate + if (pkt->transmissions == 1) { + // Estimate the round trip time. 
+ const uint32 ertt = (uint32)((utp_call_get_microseconds(this->ctx, this) - pkt->time_sent) / 1000); + if (rtt == 0) { + // First round trip time sample + rtt = ertt; + rtt_var = ertt / 2; + // sanity check. rtt should never be more than 6 seconds +// assert(rtt < 6000); + } else { + // Compute new round trip times + const int delta = (int)rtt - ertt; + rtt_var = rtt_var + (int)(abs(delta) - rtt_var) / 4; + rtt = rtt - rtt/8 + ertt/8; + // sanity check. rtt should never be more than 6 seconds +// assert(rtt < 6000); + rtt_hist.add_sample(ertt, ctx->current_ms); + } + rto = max(rtt + rtt_var * 4, 1000); + + #if UTP_DEBUG_LOGGING + log(UTP_LOG_DEBUG, "rtt:%u avg:%u var:%u rto:%u", + ertt, rtt, rtt_var, rto); + #endif + + } + retransmit_timeout = rto; + rto_timeout = ctx->current_ms + rto; + // if need_resend is set, this packet has already + // been considered timed-out, and is not included in + // the cur_window anymore + if (!pkt->need_resend) { + assert(cur_window >= pkt->payload); + cur_window -= pkt->payload; + } + free(pkt); + retransmit_count = 0; + return 0; +} + +// count the number of bytes that were acked by the EACK header +size_t UTPSocket::selective_ack_bytes(uint base, const byte* mask, byte len, int64& min_rtt) +{ + if (cur_window_packets == 0) return 0; + + size_t acked_bytes = 0; + int bits = len * 8; + uint64 now = utp_call_get_microseconds(this->ctx, this); + + do { + uint v = base + bits; + + // ignore bits that haven't been sent yet + // see comment in UTPSocket::selective_ack + if (((seq_nr - v - 1) & ACK_NR_MASK) >= (uint16)(cur_window_packets - 1)) + continue; + + // ignore bits that represents packets we haven't sent yet + // or packets that have already been acked + OutgoingPacket *pkt = (OutgoingPacket*)outbuf.get(v); + if (!pkt || pkt->transmissions == 0) + continue; + + // Count the number of segments that were successfully received past it. 
+ if (bits >= 0 && mask[bits>>3] & (1 << (bits & 7))) { + assert((int)(pkt->payload) >= 0); + acked_bytes += pkt->payload; + if (pkt->time_sent < now) + min_rtt = min(min_rtt, now - pkt->time_sent); + else + min_rtt = min(min_rtt, 50000); + continue; + } + } while (--bits >= -1); + return acked_bytes; +} + +enum { MAX_EACK = 128 }; + +void UTPSocket::selective_ack(uint base, const byte *mask, byte len) +{ + if (cur_window_packets == 0) return; + + // the range is inclusive [0, 31] bits + int bits = len * 8 - 1; + + int count = 0; + + // resends is a stack of sequence numbers we need to resend. Since we + // iterate in reverse over the acked packets, at the end, the top packets + // are the ones we want to resend + int resends[MAX_EACK]; + int nr = 0; + +#if UTP_DEBUG_LOGGING + char bitmask[1024] = {0}; + int counter = bits; + for (int i = 0; i <= bits; ++i) { + bool bit_set = counter >= 0 && mask[counter>>3] & (1 << (counter & 7)); + bitmask[i] = bit_set ? '1' : '0'; + --counter; + } + + log(UTP_LOG_DEBUG, "Got EACK [%s] base:%u", bitmask, base); +#endif + + do { + // we're iterating over the bits from higher sequence numbers + // to lower (kind of in reverse order, wich might not be very + // intuitive) + uint v = base + bits; + + // ignore bits that haven't been sent yet + // and bits that fall below the ACKed sequence number + // this can happen if an EACK message gets + // reordered and arrives after a packet that ACKs up past + // the base for thie EACK message + + // this is essentially the same as: + // if v >= seq_nr || v <= seq_nr - cur_window_packets + // but it takes wrapping into account + + // if v == seq_nr the -1 will make it wrap. if v > seq_nr + // it will also wrap (since it will fall further below 0) + // and be > cur_window_packets. 
+ // if v == seq_nr - cur_window_packets, the result will be + // seq_nr - (seq_nr - cur_window_packets) - 1 + // == seq_nr - seq_nr + cur_window_packets - 1 + // == cur_window_packets - 1 which will be caught by the + // test. If v < seq_nr - cur_window_packets the result will grow + // fall furhter outside of the cur_window_packets range. + + // sequence number space: + // + // rejected < accepted > rejected + // <============+--------------+============> + // ^ ^ + // | | + // (seq_nr-wnd) seq_nr + + if (((seq_nr - v - 1) & ACK_NR_MASK) >= (uint16)(cur_window_packets - 1)) + continue; + + // this counts as a duplicate ack, even though we might have + // received an ack for this packet previously (in another EACK + // message for instance) + bool bit_set = bits >= 0 && mask[bits>>3] & (1 << (bits & 7)); + + // if this packet is acked, it counts towards the duplicate ack counter + if (bit_set) count++; + + // ignore bits that represents packets we haven't sent yet + // or packets that have already been acked + OutgoingPacket *pkt = (OutgoingPacket*)outbuf.get(v); + if (!pkt || pkt->transmissions == 0) { + + #if UTP_DEBUG_LOGGING + log(UTP_LOG_DEBUG, "skipping %u. pkt:%08x transmissions:%u %s", + v, pkt, pkt?pkt->transmissions:0, pkt?"(not sent yet?)":"(already acked?)"); + #endif + continue; + } + + // Count the number of segments that were successfully received past it. + if (bit_set) { + // the selective ack should never ACK the packet we're waiting for to decrement cur_window_packets + assert((v & outbuf.mask) != ((seq_nr - cur_window_packets) & outbuf.mask)); + ack_packet(v); + continue; + } + + // Resend segments + // if count is less than our re-send limit, we haven't seen enough + // acked packets in front of this one to warrant a re-send. 
+ // if count == 0, we're still going through the tail of zeroes + if (((v - fast_resend_seq_nr) & ACK_NR_MASK) <= OUTGOING_BUFFER_MAX_SIZE && + count >= DUPLICATE_ACKS_BEFORE_RESEND) { + // resends is a stack, and we're mostly interested in the top of it + // if we're full, just throw away the lower half + if (nr >= MAX_EACK - 2) { + memmove(resends, &resends[MAX_EACK/2], MAX_EACK/2 * sizeof(resends[0])); + nr -= MAX_EACK / 2; + } + resends[nr++] = v; + + #if UTP_DEBUG_LOGGING + log(UTP_LOG_DEBUG, "no ack for %u", v); + #endif + + } else { + + #if UTP_DEBUG_LOGGING + log(UTP_LOG_DEBUG, "not resending %u count:%d dup_ack:%u fast_resend_seq_nr:%u", + v, count, duplicate_ack, fast_resend_seq_nr); + #endif + } + } while (--bits >= -1); + + if (((base - 1 - fast_resend_seq_nr) & ACK_NR_MASK) <= OUTGOING_BUFFER_MAX_SIZE && + count >= DUPLICATE_ACKS_BEFORE_RESEND) { + // if we get enough duplicate acks to start + // resending, the first packet we should resend + // is base-1 + resends[nr++] = (base - 1) & ACK_NR_MASK; + + #if UTP_DEBUG_LOGGING + log(UTP_LOG_DEBUG, "no ack for %u", (base - 1) & ACK_NR_MASK); + #endif + + } else { + #if UTP_DEBUG_LOGGING + log(UTP_LOG_DEBUG, "not resending %u count:%d dup_ack:%u fast_resend_seq_nr:%u", + base - 1, count, duplicate_ack, fast_resend_seq_nr); + #endif + } + + bool back_off = false; + int i = 0; + while (nr > 0) { + uint v = resends[--nr]; + // don't consider the tail of 0:es to be lost packets + // only unacked packets with acked packets after should + // be considered lost + OutgoingPacket *pkt = (OutgoingPacket*)outbuf.get(v); + + // this may be an old (re-ordered) packet, and some of the + // packets in here may have been acked already. In which + // case they will not be in the send queue anymore + if (!pkt) continue; + + // used in parse_log.py + log(UTP_LOG_NORMAL, "Packet %u lost. 
Resending", v); + + // On Loss + back_off = true; + + #ifdef _DEBUG + ++_stats.rexmit; + #endif + + send_packet(pkt); + fast_resend_seq_nr = (v + 1) & ACK_NR_MASK; + + // Re-send max 4 packets. + if (++i >= 4) break; + } + + if (back_off) + maybe_decay_win(ctx->current_ms); + + duplicate_ack = count; +} + +void UTPSocket::apply_ccontrol(size_t bytes_acked, uint32 actual_delay, int64 min_rtt) +{ + // the delay can never be greater than the rtt. The min_rtt + // variable is the RTT in microseconds + + assert(min_rtt >= 0); + int32 our_delay = min(our_hist.get_value(), uint32(min_rtt)); + assert(our_delay != INT_MAX); + assert(our_delay >= 0); + + utp_call_on_delay_sample(this->ctx, this, our_delay / 1000); + + // This test the connection under heavy load from foreground + // traffic. Pretend that our delays are very high to force the + // connection to use sub-packet size window sizes + //our_delay *= 4; + + // target is microseconds + int target = target_delay; + if (target <= 0) target = 100000; + + // this is here to compensate for very large clock drift that affects + // the congestion controller into giving certain endpoints an unfair + // share of the bandwidth. We have an estimate of the clock drift + // (clock_drift). The unit of this is microseconds per 5 seconds. + // empirically, a reasonable cut-off appears to be about 200000 + // (which is pretty high). 
The main purpose is to compensate for
+	// people trying to "cheat" uTP by making their clock run slower,
+	// and this definitely catches that without any risk of false positives
+	// if clock_drift < -200000 start applying a penalty delay proportional
+	// to how far beyond -200000 the clock drift is
+	int32 penalty = 0;
+	if (clock_drift < -200000) {
+		penalty = (-clock_drift - 200000) / 7;
+		our_delay += penalty;
+	}
+
+	double off_target = target - our_delay;
+
+	// this is the same as:
+	//
+	//    (min(off_target, target) / target) * (bytes_acked / max_window) * MAX_CWND_INCREASE_BYTES_PER_RTT
+	//
+	// so, it's scaling the max increase by the fraction of the window this ack represents, and the fraction
+	// of the target delay the current delay represents.
+	// The min() around off_target protects against crazy values of our_delay, which may happen when the
+	// timestamps wraps, or by just having a malicious peer sending garbage. This caps the increase
+	// of the window size to MAX_CWND_INCREASE_BYTES_PER_RTT per rtt.
+	// as for large negative numbers, this direction is already capped at the min packet size further down
+	// the min around the bytes_acked protects against the case where the window size was recently
+	// shrunk and the number of acked bytes exceeds that. This is considered no more than one full
+	// window, in order to keep the gain within sane boundaries.
+
+	assert(bytes_acked > 0);
+	double window_factor = (double)min(bytes_acked, max_window) / (double)max(max_window, bytes_acked);
+
+	double delay_factor = off_target / target;
+	double scaled_gain = MAX_CWND_INCREASE_BYTES_PER_RTT * window_factor * delay_factor;
+
+	// since MAX_CWND_INCREASE_BYTES_PER_RTT is a cap on how much the window size (max_window)
+	// may increase per RTT, we may not increase the window size more than that proportional
+	// to the number of bytes that were acked, so that once one window has been acked (one rtt)
+	// the increase limit is not exceeded
+	// the +1.
is to allow for floating point imprecision + assert(scaled_gain <= 1. + MAX_CWND_INCREASE_BYTES_PER_RTT * (double)min(bytes_acked, max_window) / (double)max(max_window, bytes_acked)); + + if (scaled_gain > 0 && ctx->current_ms - last_maxed_out_window > 1000) { + // if it was more than 1 second since we tried to send a packet + // and stopped because we hit the max window, we're most likely rate + // limited (which prevents us from ever hitting the window size) + // if this is the case, we cannot let the max_window grow indefinitely + scaled_gain = 0; + } + + size_t ledbat_cwnd = (max_window + scaled_gain < MIN_WINDOW_SIZE) ? MIN_WINDOW_SIZE : (size_t)(max_window + scaled_gain); + + if (slow_start) { + size_t ss_cwnd = (size_t)(max_window + window_factor*get_packet_size()); + if (ss_cwnd > ssthresh) { + slow_start = false; + } else if (our_delay > target*0.9) { + // even if we're a little under the target delay, we conservatively + // discontinue the slow start phase + slow_start = false; + ssthresh = max_window; + } else { + max_window = max(ss_cwnd, ledbat_cwnd); + } + } else { + max_window = ledbat_cwnd; + } + + + // make sure that the congestion window is below max + // make sure that we don't shrink our window too small + max_window = clamp(max_window, MIN_WINDOW_SIZE, opt_sndbuf); + + // used in parse_log.py + log(UTP_LOG_NORMAL, "actual_delay:%u our_delay:%d their_delay:%u off_target:%d max_window:%u " + "delay_base:%u delay_sum:%d target_delay:%d acked_bytes:%u cur_window:%u " + "scaled_gain:%f rtt:%u rate:%u wnduser:%u rto:%u timeout:%d get_microseconds:" I64u " " + "cur_window_packets:%u packet_size:%u their_delay_base:%u their_actual_delay:%u " + "average_delay:%d clock_drift:%d clock_drift_raw:%d delay_penalty:%d current_delay_sum:" I64u + "current_delay_samples:%d average_delay_base:%d last_maxed_out_window:" I64u " opt_sndbuf:%d " + "current_ms:" I64u "", + actual_delay, our_delay / 1000, their_hist.get_value() / 1000, + int(off_target / 1000), 
uint(max_window), uint32(our_hist.delay_base), + int((our_delay + their_hist.get_value()) / 1000), int(target / 1000), uint(bytes_acked), + (uint)(cur_window - bytes_acked), (float)(scaled_gain), rtt, + (uint)(max_window * 1000 / (rtt_hist.delay_base?rtt_hist.delay_base:50)), + (uint)max_window_user, rto, (int)(rto_timeout - ctx->current_ms), + utp_call_get_microseconds(this->ctx, this), cur_window_packets, (uint)get_packet_size(), + their_hist.delay_base, their_hist.delay_base + their_hist.get_value(), + average_delay, clock_drift, clock_drift_raw, penalty / 1000, + current_delay_sum, current_delay_samples, average_delay_base, + uint64(last_maxed_out_window), int(opt_sndbuf), uint64(ctx->current_ms)); +} + +static void utp_register_recv_packet(UTPSocket *conn, size_t len) +{ + #ifdef _DEBUG + ++conn->_stats.nrecv; + conn->_stats.nbytes_recv += len; + #endif + + if (len <= PACKET_SIZE_MID) { + if (len <= PACKET_SIZE_EMPTY) { + conn->ctx->context_stats._nraw_recv[PACKET_SIZE_EMPTY_BUCKET]++; + } else if (len <= PACKET_SIZE_SMALL) { + conn->ctx->context_stats._nraw_recv[PACKET_SIZE_SMALL_BUCKET]++; + } else + conn->ctx->context_stats._nraw_recv[PACKET_SIZE_MID_BUCKET]++; + } else { + if (len <= PACKET_SIZE_BIG) { + conn->ctx->context_stats._nraw_recv[PACKET_SIZE_BIG_BUCKET]++; + } else + conn->ctx->context_stats._nraw_recv[PACKET_SIZE_HUGE_BUCKET]++; + } +} + +// returns the max number of bytes of payload the uTP +// connection is allowed to send +size_t UTPSocket::get_packet_size() const +{ + int header_size = sizeof(PacketFormatV1); + size_t mtu = mtu_last ? mtu_last : mtu_ceiling; + return mtu - header_size; +} + +// Process an incoming packet +// syn is true if this is the first packet received. 
It will cut off parsing
+// as soon as the header is done
+size_t utp_process_incoming(UTPSocket *conn, const byte *packet, size_t len, bool syn = false)
+{
+	utp_register_recv_packet(conn, len);
+
+	conn->ctx->current_ms = utp_call_get_milliseconds(conn->ctx, conn);
+
+	const PacketFormatV1 *pf1 = (PacketFormatV1*)packet;
+	const byte *packet_end = packet + len;
+
+	uint16 pk_seq_nr = pf1->seq_nr;
+	uint16 pk_ack_nr = pf1->ack_nr;
+	uint8 pk_flags = pf1->type();
+
+	if (pk_flags >= ST_NUM_STATES) return 0;
+
+	#if UTP_DEBUG_LOGGING
+	conn->log(UTP_LOG_DEBUG, "Got %s. seq_nr:%u ack_nr:%u state:%s timestamp:" I64u " reply_micro:%u"
+			, flagnames[pk_flags], pk_seq_nr, pk_ack_nr, statenames[conn->state]
+			, uint64(pf1->tv_usec), (uint32)(pf1->reply_micro));
+	#endif
+
+	// mark receipt time
+	uint64 time = utp_call_get_microseconds(conn->ctx, conn);
+
+	// window packets size is used to calculate a minimum
+	// permissible range for received acks. connections with acks falling
+	// out of this range are dropped
+	const uint16 curr_window = max(conn->cur_window_packets + ACK_NR_ALLOWED_WINDOW, ACK_NR_ALLOWED_WINDOW);
+
+	// ignore packets whose ack_nr is invalid. This would imply a spoofed address
+	// or a malicious attempt to attack the uTP implementation.
+	// acking a packet that hasn't been sent yet!
+	// SYN packets have an exception, since there are no previous packets
+	if ((pk_flags != ST_SYN || conn->state != CS_SYN_RECV) &&
+		(wrapping_compare_less(conn->seq_nr - 1, pk_ack_nr, ACK_NR_MASK)
+		|| wrapping_compare_less(pk_ack_nr, conn->seq_nr - 1 - curr_window, ACK_NR_MASK))) {
+#if UTP_DEBUG_LOGGING
+		conn->log(UTP_LOG_DEBUG, "Invalid ack_nr: %u. our seq_nr: %u last unacked: %u"
+			, pk_ack_nr, conn->seq_nr, (conn->seq_nr - conn->cur_window_packets) & ACK_NR_MASK);
+#endif
+		return 0;
+	}
+
+	// RSTs are handled earlier, since the connid matches the send id not the recv id
+	assert(pk_flags != ST_RESET);
+
+	// TODO: maybe send a ST_RESET if we're in CS_RESET?
+ + const byte *selack_ptr = NULL; + + // Unpack UTP packet options + // Data pointer + const byte *data = (const byte*)pf1 + conn->get_header_size(); + if (conn->get_header_size() > len) { + + #if UTP_DEBUG_LOGGING + conn->log(UTP_LOG_DEBUG, "Invalid packet size (less than header size)"); + #endif + + return 0; + } + // Skip the extension headers + uint extension = pf1->ext; + if (extension != 0) { + do { + // Verify that the packet is valid. + data += 2; + + if ((int)(packet_end - data) < 0 || (int)(packet_end - data) < data[-1]) { + + #if UTP_DEBUG_LOGGING + conn->log(UTP_LOG_DEBUG, "Invalid len of extensions"); + #endif + + return 0; + } + + switch(extension) { + case 1: // Selective Acknowledgment + selack_ptr = data; + break; + case 2: // extension bits + if (data[-1] != 8) { + + #if UTP_DEBUG_LOGGING + conn->log(UTP_LOG_DEBUG, "Invalid len of extension bits header"); + #endif + + return 0; + } + memcpy(conn->extensions, data, 8); + + #if UTP_DEBUG_LOGGING + conn->log(UTP_LOG_DEBUG, "got extension bits:%02x%02x%02x%02x%02x%02x%02x%02x", + conn->extensions[0], conn->extensions[1], conn->extensions[2], conn->extensions[3], + conn->extensions[4], conn->extensions[5], conn->extensions[6], conn->extensions[7]); + #endif + } + extension = data[-2]; + data += data[-1]; + } while (extension); + } + + if (conn->state == CS_SYN_SENT) { + // if this is a syn-ack, initialize our ack_nr + // to match the sequence number we got from + // the other end + conn->ack_nr = (pk_seq_nr - 1) & SEQ_NR_MASK; + } + + conn->last_got_packet = conn->ctx->current_ms; + + if (syn) { + return 0; + } + + // seqnr is the number of packets past the expected + // packet this is. ack_nr is the last acked, seq_nr is the + // current. Subtracring 1 makes 0 mean "this is the next + // expected packet". + const uint seqnr = (pk_seq_nr - conn->ack_nr - 1) & SEQ_NR_MASK; + + // Getting an invalid sequence number? 
+ if (seqnr >= REORDER_BUFFER_MAX_SIZE) { + if (seqnr >= (SEQ_NR_MASK + 1) - REORDER_BUFFER_MAX_SIZE && pk_flags != ST_STATE) { + conn->schedule_ack(); + } + + #if UTP_DEBUG_LOGGING + conn->log(UTP_LOG_DEBUG, " Got old Packet/Ack (%u/%u)=%u" + , pk_seq_nr, conn->ack_nr, seqnr); + #endif + return 0; + } + + // Process acknowledgment + // acks is the number of packets that was acked + int acks = (pk_ack_nr - (conn->seq_nr - 1 - conn->cur_window_packets)) & ACK_NR_MASK; + + // this happens when we receive an old ack nr + if (acks > conn->cur_window_packets) acks = 0; + + // if we get the same ack_nr as in the last packet + // increase the duplicate_ack counter, otherwise reset + // it to 0. + // It's important to only count ACKs in ST_STATE packets. Any other + // packet (primarily ST_DATA) is likely to have been sent because of the + // other end having new outgoing data, not in response to incoming data. + // For instance, if we're receiving a steady stream of payload with no + // outgoing data, and we suddently have a few bytes of payload to send (say, + // a bittorrent HAVE message), we're very likely to see 3 duplicate ACKs + // immediately after sending our payload packet. This effectively disables + // the fast-resend on duplicate-ack logic for bi-directional connections + // (except in the case of a selective ACK). This is in line with BSD4.4 TCP + // implementation. 
+ if (conn->cur_window_packets > 0) { + if (pk_ack_nr == ((conn->seq_nr - conn->cur_window_packets - 1) & ACK_NR_MASK) + && conn->cur_window_packets > 0 + && pk_flags == ST_STATE) { + ++conn->duplicate_ack; + if (conn->duplicate_ack == DUPLICATE_ACKS_BEFORE_RESEND && conn->mtu_probe_seq) { + // It's likely that the probe was rejected due to its size, but we haven't got an + // ICMP report back yet + if (pk_ack_nr == ((conn->mtu_probe_seq - 1) & ACK_NR_MASK)) { + conn->mtu_ceiling = conn->mtu_probe_size - 1; + conn->mtu_search_update(); + conn->log(UTP_LOG_MTU, "MTU [DUPACK] floor:%d ceiling:%d current:%d" + , conn->mtu_floor, conn->mtu_ceiling, conn->mtu_last); + } else { + // A non-probe was blocked before our probe. + // Can't conclude much, send a new probe + conn->mtu_probe_seq = conn->mtu_probe_size = 0; + } + } + } else { + conn->duplicate_ack = 0; + } + + // TODO: if duplicate_ack == DUPLICATE_ACK_BEFORE_RESEND + // and fast_resend_seq_nr <= ack_nr + 1 + // resend ack_nr + 1 + // also call maybe_decay_win() + } + + // figure out how many bytes were acked + size_t acked_bytes = 0; + + // the minimum rtt of all acks + // this is the upper limit on the delay we get back + // from the other peer. Our delay cannot exceed + // the rtt of the packet. If it does, clamp it. 
+ // this is done in apply_ledbat_ccontrol() + int64 min_rtt = INT64_MAX; + + uint64 now = utp_call_get_microseconds(conn->ctx, conn); + + for (int i = 0; i < acks; ++i) { + int seq = (conn->seq_nr - conn->cur_window_packets + i) & ACK_NR_MASK; + OutgoingPacket *pkt = (OutgoingPacket*)conn->outbuf.get(seq); + if (pkt == 0 || pkt->transmissions == 0) continue; + assert((int)(pkt->payload) >= 0); + acked_bytes += pkt->payload; + if (conn->mtu_probe_seq && seq == conn->mtu_probe_seq) { + conn->mtu_floor = conn->mtu_probe_size; + conn->mtu_search_update(); + conn->log(UTP_LOG_MTU, "MTU [ACK] floor:%d ceiling:%d current:%d" + , conn->mtu_floor, conn->mtu_ceiling, conn->mtu_last); + } + + // in case our clock is not monotonic + if (pkt->time_sent < now) + min_rtt = min(min_rtt, now - pkt->time_sent); + else + min_rtt = min(min_rtt, 50000); + } + + // count bytes acked by EACK + if (selack_ptr != NULL) { + acked_bytes += conn->selective_ack_bytes((pk_ack_nr + 2) & ACK_NR_MASK, + selack_ptr, selack_ptr[-1], min_rtt); + } + + #if UTP_DEBUG_LOGGING + conn->log(UTP_LOG_DEBUG, "acks:%d acked_bytes:%u seq_nr:%d cur_window:%u cur_window_packets:%u relative_seqnr:%u max_window:%u min_rtt:%u rtt:%u", + acks, (uint)acked_bytes, conn->seq_nr, (uint)conn->cur_window, conn->cur_window_packets, + seqnr, (uint)conn->max_window, (uint)(min_rtt / 1000), conn->rtt); + #endif + + uint64 p = pf1->tv_usec; + + conn->last_measured_delay = conn->ctx->current_ms; + + // get delay in both directions + // record the delay to report back + const uint32 their_delay = (uint32)(p == 0 ? 
0 : time - p);
+	conn->reply_micro = their_delay;
+	uint32 prev_delay_base = conn->their_hist.delay_base;
+	if (their_delay != 0) conn->their_hist.add_sample(their_delay, conn->ctx->current_ms);
+
+	// if their new delay base is less than their previous one
+	// we should shift our delay base in the other direction in order
+	// to take the clock skew into account
+	if (prev_delay_base != 0 &&
+		wrapping_compare_less(conn->their_hist.delay_base, prev_delay_base, TIMESTAMP_MASK)) {
+		// never adjust more than 10 milliseconds
+		if (prev_delay_base - conn->their_hist.delay_base <= 10000) {
+			conn->our_hist.shift(prev_delay_base - conn->their_hist.delay_base);
+		}
+	}
+
+	const uint32 actual_delay = (uint32(pf1->reply_micro)==INT_MAX?0:uint32(pf1->reply_micro));
+
+	// if the actual delay is 0, it means the other end
+	// hasn't received a sample from us yet, and doesn't
+	// know what it is. We can't update our history unless
+	// we have a true measured sample
+	if (actual_delay != 0) {
+		conn->our_hist.add_sample(actual_delay, conn->ctx->current_ms);
+
+		// this is keeping an average of the delay samples
+		// we've received within the last 5 seconds. We sum
+		// all the samples and increase the count in order to
+		// calculate the average every 5 seconds. The samples
+		// are based off of the average_delay_base to deal with
+		// wrapping counters.
+ if (conn->average_delay_base == 0) conn->average_delay_base = actual_delay; + int64 average_delay_sample = 0; + // distance walking from lhs to rhs, downwards + const uint32 dist_down = conn->average_delay_base - actual_delay; + // distance walking from lhs to rhs, upwards + const uint32 dist_up = actual_delay - conn->average_delay_base; + + if (dist_down > dist_up) { +// assert(dist_up < INT_MAX / 4); + // average_delay_base < actual_delay, we should end up + // with a positive sample + average_delay_sample = dist_up; + } else { +// assert(-int64(dist_down) < INT_MAX / 4); + // average_delay_base >= actual_delay, we should end up + // with a negative sample + average_delay_sample = -int64(dist_down); + } + conn->current_delay_sum += average_delay_sample; + ++conn->current_delay_samples; + + if (conn->ctx->current_ms > conn->average_sample_time) { + + int32 prev_average_delay = conn->average_delay; + + assert(conn->current_delay_sum / conn->current_delay_samples < INT_MAX); + assert(conn->current_delay_sum / conn->current_delay_samples > -INT_MAX); + // write the new average + conn->average_delay = (int32)(conn->current_delay_sum / conn->current_delay_samples); + // each slot represents 5 seconds + conn->average_sample_time += 5000; + + conn->current_delay_sum = 0; + conn->current_delay_samples = 0; + + // this makes things very confusing when logging the average delay +//#if !g_log_utp + // normalize the average samples + // since we're only interested in the slope + // of the curve formed by the average delay samples, + // we can cancel out the actual offset to make sure + // we won't have problems with wrapping. + int min_sample = min(prev_average_delay, conn->average_delay); + int max_sample = max(prev_average_delay, conn->average_delay); + + // normalize around zero. 
Try to keep the min <= 0 and max >= 0 + int adjust = 0; + if (min_sample > 0) { + // adjust all samples (and the baseline) down by min_sample + adjust = -min_sample; + } else if (max_sample < 0) { + // adjust all samples (and the baseline) up by -max_sample + adjust = -max_sample; + } + if (adjust) { + conn->average_delay_base -= adjust; + conn->average_delay += adjust; + prev_average_delay += adjust; + } +//#endif + + // update the clock drift estimate + // the unit is microseconds per 5 seconds + // what we're doing is just calculating the average of the + // difference between each slot. Since each slot is 5 seconds + // and the timestamps unit are microseconds, we'll end up with + // the average slope across our history. If there is a consistent + // trend, it will show up in this value + + //int64 slope = 0; + int32 drift = conn->average_delay - prev_average_delay; + + // clock_drift is a rolling average + conn->clock_drift = (int64(conn->clock_drift) * 7 + drift) / 8; + conn->clock_drift_raw = drift; + } + } + + // if our new delay base is less than our previous one + // we should shift the other end's delay base in the other + // direction in order to take the clock skew into account + // This is commented out because it creates bad interactions + // with our adjustment in the other direction. We don't really + // need our estimates of the other peer to be very accurate + // anyway. The problem with shifting here is that we're more + // likely shift it back later because of a low latency. 
This + // second shift back would cause us to shift our delay base + // which then get's into a death spiral of shifting delay bases +/* if (prev_delay_base != 0 && + wrapping_compare_less(conn->our_hist.delay_base, prev_delay_base)) { + // never adjust more than 10 milliseconds + if (prev_delay_base - conn->our_hist.delay_base <= 10000) { + conn->their_hist.Shift(prev_delay_base - conn->our_hist.delay_base); + } + } +*/ + + // if the delay estimate exceeds the RTT, adjust the base_delay to + // compensate + assert(min_rtt >= 0); + if (int64(conn->our_hist.get_value()) > min_rtt) { + conn->our_hist.shift((uint32)(conn->our_hist.get_value() - min_rtt)); + } + + // only apply the congestion controller on acks + // if we don't have a delay measurement, there's + // no point in invoking the congestion control + if (actual_delay != 0 && acked_bytes >= 1) + conn->apply_ccontrol(acked_bytes, actual_delay, min_rtt); + + // sanity check, the other end should never ack packets + // past the point we've sent + if (acks <= conn->cur_window_packets) { + conn->max_window_user = pf1->windowsize; + + // If max user window is set to 0, then we startup a timer + // That will reset it to 1 after 15 seconds. + if (conn->max_window_user == 0) + // Reset max_window_user to 1 every 15 seconds. + conn->zerowindow_time = conn->ctx->current_ms + 15000; + + // Respond to connect message + // Switch to CONNECTED state. + // If this is an ack and we're in still handshaking + // transition over to the connected state. + + // Incoming connection completion + if (pk_flags == ST_DATA && conn->state == CS_SYN_RECV) { + conn->state = CS_CONNECTED; + } + + // Outgoing connection completion + if (pk_flags == ST_STATE && conn->state == CS_SYN_SENT) { + conn->state = CS_CONNECTED; + + // If the user has defined the ON_CONNECT callback, use that to + // notify the user that the socket is now connected. If ON_CONNECT + // has not been defined, notify the user via ON_STATE_CHANGE. 
+ if (conn->ctx->callbacks[UTP_ON_CONNECT]) + utp_call_on_connect(conn->ctx, conn); + else + utp_call_on_state_change(conn->ctx, conn, UTP_STATE_CONNECT); + + // We've sent a fin, and everything was ACKed (including the FIN). + // cur_window_packets == acks means that this packet acked all + // the remaining packets that were in-flight. + } else if (conn->fin_sent && conn->cur_window_packets == acks) { + conn->fin_sent_acked = true; + if (conn->close_requested) { + conn->state = CS_DESTROY; + } + } + + // Update fast resend counter + if (wrapping_compare_less(conn->fast_resend_seq_nr + , (pk_ack_nr + 1) & ACK_NR_MASK, ACK_NR_MASK)) + conn->fast_resend_seq_nr = (pk_ack_nr + 1) & ACK_NR_MASK; + + #if UTP_DEBUG_LOGGING + conn->log(UTP_LOG_DEBUG, "fast_resend_seq_nr:%u", conn->fast_resend_seq_nr); + #endif + + for (int i = 0; i < acks; ++i) { + int ack_status = conn->ack_packet(conn->seq_nr - conn->cur_window_packets); + // if ack_status is 0, the packet was acked. + // if acl_stauts is 1, it means that the packet had already been acked + // if it's 2, the packet has not been sent yet + // We need to break this loop in the latter case. This could potentially + // happen if we get an ack_nr that does not exceed what we have stuffed + // into the outgoing buffer, but does exceed what we have sent + if (ack_status == 2) { + #ifdef _DEBUG + OutgoingPacket* pkt = (OutgoingPacket*)conn->outbuf.get(conn->seq_nr - conn->cur_window_packets); + assert(pkt->transmissions == 0); + #endif + + break; + } + conn->cur_window_packets--; + + #if UTP_DEBUG_LOGGING + conn->log(UTP_LOG_DEBUG, "decementing cur_window_packets:%u", conn->cur_window_packets); + #endif + + } + + #ifdef _DEBUG + if (conn->cur_window_packets == 0) + assert(conn->cur_window == 0); + #endif + + // packets in front of this may have been acked by a + // selective ack (EACK). 
Keep decreasing the window packet size + // until we hit a packet that is still waiting to be acked + // in the send queue + // this is especially likely to happen when the other end + // has the EACK send bug older versions of uTP had + while (conn->cur_window_packets > 0 && !conn->outbuf.get(conn->seq_nr - conn->cur_window_packets)) { + conn->cur_window_packets--; + + #if UTP_DEBUG_LOGGING + conn->log(UTP_LOG_DEBUG, "decementing cur_window_packets:%u", conn->cur_window_packets); + #endif + + } + + #ifdef _DEBUG + if (conn->cur_window_packets == 0) + assert(conn->cur_window == 0); + #endif + + // this invariant should always be true + assert(conn->cur_window_packets == 0 || conn->outbuf.get(conn->seq_nr - conn->cur_window_packets)); + + // flush Nagle + if (conn->cur_window_packets == 1) { + OutgoingPacket *pkt = (OutgoingPacket*)conn->outbuf.get(conn->seq_nr - 1); + // do we still have quota? + if (pkt->transmissions == 0) { + conn->send_packet(pkt); + } + } + + // Fast timeout-retry + if (conn->fast_timeout) { + + #if UTP_DEBUG_LOGGING + conn->log(UTP_LOG_DEBUG, "Fast timeout %u,%u,%u?", (uint)conn->cur_window, conn->seq_nr - conn->timeout_seq_nr, conn->timeout_seq_nr); + #endif + + // if the fast_resend_seq_nr is not pointing to the oldest outstanding packet, it suggests that we've already + // resent the packet that timed out, and we should leave the fast-timeout mode. 
+ if (((conn->seq_nr - conn->cur_window_packets) & ACK_NR_MASK) != conn->fast_resend_seq_nr) { + conn->fast_timeout = false; + } else { + // resend the oldest packet and increment fast_resend_seq_nr + // to not allow another fast resend on it again + OutgoingPacket *pkt = (OutgoingPacket*)conn->outbuf.get(conn->seq_nr - conn->cur_window_packets); + if (pkt && pkt->transmissions > 0) { + + #if UTP_DEBUG_LOGGING + conn->log(UTP_LOG_DEBUG, "Packet %u fast timeout-retry.", conn->seq_nr - conn->cur_window_packets); + #endif + + #ifdef _DEBUG + ++conn->_stats.fastrexmit; + #endif + + conn->fast_resend_seq_nr++; + conn->send_packet(pkt); + } + } + } + } + + // Process selective acknowledgent + if (selack_ptr != NULL) { + conn->selective_ack(pk_ack_nr + 2, selack_ptr, selack_ptr[-1]); + } + + // this invariant should always be true + assert(conn->cur_window_packets == 0 || conn->outbuf.get(conn->seq_nr - conn->cur_window_packets)); + + #if UTP_DEBUG_LOGGING + conn->log(UTP_LOG_DEBUG, "acks:%d acked_bytes:%u seq_nr:%u cur_window:%u cur_window_packets:%u ", + acks, (uint)acked_bytes, conn->seq_nr, (uint)conn->cur_window, conn->cur_window_packets); + #endif + + // In case the ack dropped the current window below + // the max_window size, Mark the socket as writable + if (conn->state == CS_CONNECTED_FULL && !conn->is_full()) { + conn->state = CS_CONNECTED; + #if UTP_DEBUG_LOGGING + conn->log(UTP_LOG_DEBUG, "Socket writable. max_window:%u cur_window:%u packet_size:%u", + (uint)conn->max_window, (uint)conn->cur_window, (uint)conn->get_packet_size()); + #endif + utp_call_on_state_change(conn->ctx, conn, UTP_STATE_WRITABLE); + } + + if (pk_flags == ST_STATE) { + // This is a state packet only. + return 0; + } + + // The connection is not in a state that can accept data? + if (conn->state != CS_CONNECTED && + conn->state != CS_CONNECTED_FULL) { + return 0; + } + + // Is this a finalize packet? 
+ if (pk_flags == ST_FIN && !conn->got_fin) { + + #if UTP_DEBUG_LOGGING + conn->log(UTP_LOG_DEBUG, "Got FIN eof_pkt:%u", pk_seq_nr); + #endif + + conn->got_fin = true; + conn->eof_pkt = pk_seq_nr; + // at this point, it is possible for the + // other end to have sent packets with + // sequence numbers higher than seq_nr. + // if this is the case, our reorder_count + // is out of sync. This case is dealt with + // when we re-order and hit the eof_pkt. + // we'll just ignore any packets with + // sequence numbers past this + } + + // Getting an in-order packet? + if (seqnr == 0) { + size_t count = packet_end - data; + if (count > 0 && !conn->read_shutdown) { + + #if UTP_DEBUG_LOGGING + conn->log(UTP_LOG_DEBUG, "Got Data len:%u (rb:%u)", (uint)count, (uint)utp_call_get_read_buffer_size(conn->ctx, conn)); + #endif + + // Post bytes to the upper layer + utp_call_on_read(conn->ctx, conn, data, count); + } + conn->ack_nr++; + + // Check if the next packet has been received too, but waiting + // in the reorder buffer. + for (;;) { + + if (!conn->got_fin_reached && conn->got_fin && conn->eof_pkt == conn->ack_nr) { + conn->got_fin_reached = true; + conn->rto_timeout = conn->ctx->current_ms + min(conn->rto * 3, 60); + + #if UTP_DEBUG_LOGGING + conn->log(UTP_LOG_DEBUG, "Posting EOF"); + #endif + + utp_call_on_state_change(conn->ctx, conn, UTP_STATE_EOF); + + // if the other end wants to close, ack + conn->send_ack(); + + // reorder_count is not necessarily 0 at this point. + // even though it is most of the time, the other end + // may have sent packets with higher sequence numbers + // than what later end up being eof_pkt + // since we have received all packets up to eof_pkt + // just ignore the ones after it. + conn->reorder_count = 0; + } + + // Quick get-out in case there is nothing to reorder + if (conn->reorder_count == 0) + break; + + // Check if there are additional buffers in the reorder buffers + // that need delivery. 
+ byte *p = (byte*)conn->inbuf.get(conn->ack_nr+1); + if (p == NULL) + break; + conn->inbuf.put(conn->ack_nr+1, NULL); + count = *(uint*)p; + if (count > 0 && !conn->read_shutdown) { + // Pass the bytes to the upper layer + utp_call_on_read(conn->ctx, conn, p + sizeof(uint), count); + } + conn->ack_nr++; + + // Free the element from the reorder buffer + free(p); + assert(conn->reorder_count > 0); + conn->reorder_count--; + } + + conn->schedule_ack(); + } else { + // Getting an out of order packet. + // The packet needs to be remembered and rearranged later. + + // if we have received a FIN packet, and the EOF-sequence number + // is lower than the sequence number of the packet we just received + // something is wrong. + if (conn->got_fin && pk_seq_nr > conn->eof_pkt) { + + #if UTP_DEBUG_LOGGING + conn->log(UTP_LOG_DEBUG, "Got an invalid packet sequence number, past EOF " + "reorder_count:%u len:%u (rb:%u)", + conn->reorder_count, (uint)(packet_end - data), (uint)utp_call_get_read_buffer_size(conn->ctx, conn)); + #endif + return 0; + } + + // if the sequence number is entirely off the expected + // one, just drop it. We can't allocate buffer space in + // the inbuf entirely based on untrusted input + if (seqnr > 0x3ff) { + + #if UTP_DEBUG_LOGGING + conn->log(UTP_LOG_DEBUG, "0x%08x: Got an invalid packet sequence number, too far off " + "reorder_count:%u len:%u (rb:%u)", + conn->reorder_count, (uint)(packet_end - data), (uint)utp_call_get_read_buffer_size(conn->ctx, conn)); + #endif + return 0; + } + + // we need to grow the circle buffer before we + // check if the packet is already in here, so that + // we don't end up looking at an older packet (since + // the indices wraps around). + conn->inbuf.ensure_size(pk_seq_nr + 1, seqnr + 1); + + // Has this packet already been received? (i.e. a duplicate) + // If that is the case, just discard it. 
+ if (conn->inbuf.get(pk_seq_nr) != NULL) { + #ifdef _DEBUG + ++conn->_stats.nduprecv; + #endif + + return 0; + } + + // Allocate memory to fit the packet that needs to re-ordered + byte *mem = (byte*)malloc((packet_end - data) + sizeof(uint)); + *(uint*)mem = (uint)(packet_end - data); + memcpy(mem + sizeof(uint), data, packet_end - data); + + // Insert into reorder buffer and increment the count + // of # of packets to be reordered. + // we add one to seqnr in order to leave the last + // entry empty, that way the assert in send_ack + // is valid. we have to add one to seqnr too, in order + // to make the circular buffer grow around the correct + // point (which is conn->ack_nr + 1). + assert(conn->inbuf.get(pk_seq_nr) == NULL); + assert((pk_seq_nr & conn->inbuf.mask) != ((conn->ack_nr+1) & conn->inbuf.mask)); + conn->inbuf.put(pk_seq_nr, mem); + conn->reorder_count++; + + #if UTP_DEBUG_LOGGING + conn->log(UTP_LOG_DEBUG, "0x%08x: Got out of order data reorder_count:%u len:%u (rb:%u)", + conn->reorder_count, (uint)(packet_end - data), (uint)utp_call_get_read_buffer_size(conn->ctx, conn)); + #endif + + conn->schedule_ack(); + } + + return (size_t)(packet_end - data); +} + +inline byte UTP_Version(PacketFormatV1 const* pf) +{ + return (pf->type() < ST_NUM_STATES && pf->ext < 3 ? pf->version() : 0); +} + +UTPSocket::~UTPSocket() +{ + #if UTP_DEBUG_LOGGING + log(UTP_LOG_DEBUG, "Killing socket"); + #endif + + utp_call_on_state_change(ctx, this, UTP_STATE_DESTROYING); + + if (ctx->last_utp_socket == this) { + ctx->last_utp_socket = NULL; + } + + // Remove object from the global hash table + UTPSocketKeyData* kd = ctx->utp_sockets->Delete(UTPSocketKey(addr, conn_id_recv)); + assert(kd); + + // remove the socket from ack_sockets if it was there also + removeSocketFromAckList(this); + + // Free all memory occupied by the socket object. 
+ for (size_t i = 0; i <= inbuf.mask; i++) { + free(inbuf.elements[i]); + } + for (size_t i = 0; i <= outbuf.mask; i++) { + free(outbuf.elements[i]); + } + // TODO: The circular buffer should have a destructor + free(inbuf.elements); + free(outbuf.elements); +} + +void UTP_FreeAll(struct UTPSocketHT *utp_sockets) { + utp_hash_iterator_t it; + UTPSocketKeyData* keyData; + while ((keyData = utp_sockets->Iterate(it))) { + delete keyData->socket; + } +} + +void utp_initialize_socket( utp_socket *conn, + const struct sockaddr *addr, + socklen_t addrlen, + bool need_seed_gen, + uint32 conn_seed, + uint32 conn_id_recv, + uint32 conn_id_send) +{ + PackedSockAddr psaddr = PackedSockAddr((const SOCKADDR_STORAGE*)addr, addrlen); + + if (need_seed_gen) { + do { + conn_seed = utp_call_get_random(conn->ctx, conn); + // we identify v1 and higher by setting the first two bytes to 0x0001 + conn_seed &= 0xffff; + } while (conn->ctx->utp_sockets->Lookup(UTPSocketKey(psaddr, conn_seed))); + + conn_id_recv += conn_seed; + conn_id_send += conn_seed; + } + + conn->state = CS_IDLE; + conn->conn_seed = conn_seed; + conn->conn_id_recv = conn_id_recv; + conn->conn_id_send = conn_id_send; + conn->addr = psaddr; + conn->ctx->current_ms = utp_call_get_milliseconds(conn->ctx, NULL); + conn->last_got_packet = conn->ctx->current_ms; + conn->last_sent_packet = conn->ctx->current_ms; + conn->last_measured_delay = conn->ctx->current_ms + 0x70000000; + conn->average_sample_time = conn->ctx->current_ms + 5000; + conn->last_rwin_decay = conn->ctx->current_ms - MAX_WINDOW_DECAY; + + conn->our_hist.clear(conn->ctx->current_ms); + conn->their_hist.clear(conn->ctx->current_ms); + conn->rtt_hist.clear(conn->ctx->current_ms); + + // initialize MTU floor and ceiling + conn->mtu_reset(); + conn->mtu_last = conn->mtu_ceiling; + + conn->ctx->utp_sockets->Add(UTPSocketKey(conn->addr, conn->conn_id_recv))->socket = conn; + + // we need to fit one packet in the window when we start the connection + conn->max_window 
= conn->get_packet_size(); + + #if UTP_DEBUG_LOGGING + conn->log(UTP_LOG_DEBUG, "UTP socket initialized"); + #endif +} + +utp_socket* utp_create_socket(utp_context *ctx) +{ + assert(ctx); + if (!ctx) return NULL; + + UTPSocket *conn = new UTPSocket; // TODO: UTPSocket should have a constructor + + conn->state = CS_UNINITIALIZED; + conn->ctx = ctx; + conn->userdata = NULL; + conn->reorder_count = 0; + conn->duplicate_ack = 0; + conn->timeout_seq_nr = 0; + conn->last_rcv_win = 0; + conn->got_fin = false; + conn->got_fin_reached = false; + conn->fin_sent = false; + conn->fin_sent_acked = false; + conn->read_shutdown = false; + conn->close_requested = false; + conn->fast_timeout = false; + conn->rtt = 0; + conn->retransmit_timeout = 0; + conn->rto_timeout = 0; + conn->zerowindow_time = 0; + conn->average_delay = 0; + conn->current_delay_samples = 0; + conn->cur_window = 0; + conn->eof_pkt = 0; + conn->last_maxed_out_window = 0; + conn->mtu_probe_seq = 0; + conn->mtu_probe_size = 0; + conn->current_delay_sum = 0; + conn->average_delay_base = 0; + conn->retransmit_count = 0; + conn->rto = 3000; + conn->rtt_var = 800; + conn->seq_nr = 1; + conn->ack_nr = 0; + conn->max_window_user = 255 * PACKET_SIZE; + conn->cur_window_packets = 0; + conn->fast_resend_seq_nr = conn->seq_nr; + conn->target_delay = ctx->target_delay; + conn->reply_micro = 0; + conn->opt_sndbuf = ctx->opt_sndbuf; + conn->opt_rcvbuf = ctx->opt_rcvbuf; + conn->slow_start = true; + conn->ssthresh = conn->opt_sndbuf; + conn->clock_drift = 0; + conn->clock_drift_raw = 0; + conn->outbuf.mask = 15; + conn->inbuf.mask = 15; + conn->outbuf.elements = (void**)calloc(16, sizeof(void*)); + conn->inbuf.elements = (void**)calloc(16, sizeof(void*)); + conn->ida = -1; // set the index of every new socket in ack_sockets to + // -1, which also means it is not in ack_sockets yet + + memset(conn->extensions, 0, sizeof(conn->extensions)); + + #ifdef _DEBUG + memset(&conn->_stats, 0, sizeof(utp_socket_stats)); + #endif + + 
return conn; +} + +int utp_context_set_option(utp_context *ctx, int opt, int val) +{ + assert(ctx); + if (!ctx) return -1; + + switch (opt) { + case UTP_LOG_NORMAL: + ctx->log_normal = val ? true : false; + return 0; + + case UTP_LOG_MTU: + ctx->log_mtu = val ? true : false; + return 0; + + case UTP_LOG_DEBUG: + ctx->log_debug = val ? true : false; + return 0; + + case UTP_TARGET_DELAY: + ctx->target_delay = val; + return 0; + + case UTP_SNDBUF: + assert(val >= 1); + ctx->opt_sndbuf = val; + return 0; + + case UTP_RCVBUF: + assert(val >= 1); + ctx->opt_rcvbuf = val; + return 0; + } + return -1; +} + +int utp_context_get_option(utp_context *ctx, int opt) +{ + assert(ctx); + if (!ctx) return -1; + + switch (opt) { + case UTP_LOG_NORMAL: return ctx->log_normal ? 1 : 0; + case UTP_LOG_MTU: return ctx->log_mtu ? 1 : 0; + case UTP_LOG_DEBUG: return ctx->log_debug ? 1 : 0; + case UTP_TARGET_DELAY: return ctx->target_delay; + case UTP_SNDBUF: return ctx->opt_sndbuf; + case UTP_RCVBUF: return ctx->opt_rcvbuf; + } + return -1; +} + + +int utp_setsockopt(UTPSocket* conn, int opt, int val) +{ + assert(conn); + if (!conn) return -1; + + switch (opt) { + + case UTP_SNDBUF: + assert(val >= 1); + conn->opt_sndbuf = val; + return 0; + + case UTP_RCVBUF: + assert(val >= 1); + conn->opt_rcvbuf = val; + return 0; + + case UTP_TARGET_DELAY: + conn->target_delay = val; + return 0; + } + + return -1; +} + +int utp_getsockopt(UTPSocket* conn, int opt) +{ + assert(conn); + if (!conn) return -1; + + switch (opt) { + case UTP_SNDBUF: return conn->opt_sndbuf; + case UTP_RCVBUF: return conn->opt_rcvbuf; + case UTP_TARGET_DELAY: return conn->target_delay; + } + + return -1; +} + +// Try to connect to a specified host. 
+int utp_connect(utp_socket *conn, const struct sockaddr *to, socklen_t tolen) +{ + assert(conn); + if (!conn) return -1; + + assert(conn->state == CS_UNINITIALIZED); + if (conn->state != CS_UNINITIALIZED) { + conn->state = CS_DESTROY; + return -1; + } + + utp_initialize_socket(conn, to, tolen, true, 0, 0, 1); + + assert(conn->cur_window_packets == 0); + assert(conn->outbuf.get(conn->seq_nr) == NULL); + assert(sizeof(PacketFormatV1) == 20); + + conn->state = CS_SYN_SENT; + conn->ctx->current_ms = utp_call_get_milliseconds(conn->ctx, conn); + + // Create and send a connect message + + // used in parse_log.py + conn->log(UTP_LOG_NORMAL, "UTP_Connect conn_seed:%u packet_size:%u (B) " + "target_delay:%u (ms) delay_history:%u " + "delay_base_history:%u (minutes)", + conn->conn_seed, PACKET_SIZE, conn->target_delay / 1000, + CUR_DELAY_SIZE, DELAY_BASE_HISTORY); + + // Setup initial timeout timer. + conn->retransmit_timeout = 3000; + conn->rto_timeout = conn->ctx->current_ms + conn->retransmit_timeout; + conn->last_rcv_win = conn->get_rcv_window(); + + // if you need compatibiltiy with 1.8.1, use this. it increases attackability though. + //conn->seq_nr = 1; + conn->seq_nr = utp_call_get_random(conn->ctx, conn); + + // Create the connect packet. + const size_t header_size = sizeof(PacketFormatV1); + + OutgoingPacket *pkt = (OutgoingPacket*)malloc(sizeof(OutgoingPacket) - 1 + header_size); + PacketFormatV1* p1 = (PacketFormatV1*)pkt->data; + + memset(p1, 0, header_size); + // SYN packets are special, and have the receive ID in the connid field, + // instead of conn_id_send. 
+ p1->set_version(1); + p1->set_type(ST_SYN); + p1->ext = 0; + p1->connid = conn->conn_id_recv; + p1->windowsize = (uint32)conn->last_rcv_win; + p1->seq_nr = conn->seq_nr; + pkt->transmissions = 0; + pkt->length = header_size; + pkt->payload = 0; + + /* + #if UTP_DEBUG_LOGGING + conn->log(UTP_LOG_DEBUG, "Sending connect %s [%u].", + addrfmt(conn->addr, addrbuf), conn_seed); + #endif + */ + + // Remember the message in the outgoing queue. + conn->outbuf.ensure_size(conn->seq_nr, conn->cur_window_packets); + conn->outbuf.put(conn->seq_nr, pkt); + conn->seq_nr++; + conn->cur_window_packets++; + + #if UTP_DEBUG_LOGGING + conn->log(UTP_LOG_DEBUG, "incrementing cur_window_packets:%u", conn->cur_window_packets); + #endif + + conn->send_packet(pkt); + return 0; +} + +// Returns 1 if the UDP payload was recognized as a UTP packet, or 0 if it was not +int utp_process_udp(utp_context *ctx, const byte *buffer, size_t len, const struct sockaddr *to, socklen_t tolen) +{ + assert(ctx); + if (!ctx) return 0; + + assert(buffer); + if (!buffer) return 0; + + assert(to); + if (!to) return 0; + + const PackedSockAddr addr((const SOCKADDR_STORAGE*)to, tolen); + + if (len < sizeof(PacketFormatV1)) { + #if UTP_DEBUG_LOGGING + ctx->log(UTP_LOG_DEBUG, NULL, "recv %s len:%u too small", addrfmt(addr, addrbuf), (uint)len); + #endif + return 0; + } + + const PacketFormatV1 *pf1 = (PacketFormatV1*)buffer; + const byte version = UTP_Version(pf1); + const uint32 id = uint32(pf1->connid); + + if (version != 1) { + #if UTP_DEBUG_LOGGING + ctx->log(UTP_LOG_DEBUG, NULL, "recv %s len:%u version:%u unsupported version", addrfmt(addr, addrbuf), (uint)len, version); + #endif + + return 0; + } + + #if UTP_DEBUG_LOGGING + ctx->log(UTP_LOG_DEBUG, NULL, "recv %s len:%u id:%u", addrfmt(addr, addrbuf), (uint)len, id); + ctx->log(UTP_LOG_DEBUG, NULL, "recv id:%u seq_nr:%u ack_nr:%u", id, (uint)pf1->seq_nr, (uint)pf1->ack_nr); + #endif + + const byte flags = pf1->type(); + + if (flags == ST_RESET) { + // id is 
either our recv id or our send id + // if it's our send id, and we initiated the connection, our recv id is id + 1 + // if it's our send id, and we did not initiate the connection, our recv id is id - 1 + // we have to check every case + + UTPSocketKeyData* keyData; + if ( (keyData = ctx->utp_sockets->Lookup(UTPSocketKey(addr, id))) || + ((keyData = ctx->utp_sockets->Lookup(UTPSocketKey(addr, id + 1))) && keyData->socket->conn_id_send == id) || + ((keyData = ctx->utp_sockets->Lookup(UTPSocketKey(addr, id - 1))) && keyData->socket->conn_id_send == id)) + { + UTPSocket* conn = keyData->socket; + + #if UTP_DEBUG_LOGGING + ctx->log(UTP_LOG_DEBUG, NULL, "recv RST for existing connection"); + #endif + + if (conn->close_requested) + conn->state = CS_DESTROY; + else + conn->state = CS_RESET; + + utp_call_on_overhead_statistics(conn->ctx, conn, false, len + conn->get_udp_overhead(), close_overhead); + const int err = (conn->state == CS_SYN_SENT) ? UTP_ECONNREFUSED : UTP_ECONNRESET; + utp_call_on_error(conn->ctx, conn, err); + } + else { + #if UTP_DEBUG_LOGGING + ctx->log(UTP_LOG_DEBUG, NULL, "recv RST for unknown connection"); + #endif + } + return 1; + } + else if (flags != ST_SYN) { + UTPSocket* conn = NULL; + + if (ctx->last_utp_socket && ctx->last_utp_socket->addr == addr && ctx->last_utp_socket->conn_id_recv == id) { + conn = ctx->last_utp_socket; + } else { + UTPSocketKeyData* keyData = ctx->utp_sockets->Lookup(UTPSocketKey(addr, id)); + if (keyData) { + conn = keyData->socket; + ctx->last_utp_socket = conn; + } + } + + if (conn) { + + #if UTP_DEBUG_LOGGING + ctx->log(UTP_LOG_DEBUG, NULL, "recv processing"); + #endif + + const size_t read = utp_process_incoming(conn, buffer, len); + utp_call_on_overhead_statistics(conn->ctx, conn, false, (len - read) + conn->get_udp_overhead(), header_overhead); + return 1; + } + } + + // We have not found a matching utp_socket, and this isn't a SYN. Reject it. 
+ const uint32 seq_nr = pf1->seq_nr; + if (flags != ST_SYN) { + ctx->current_ms = utp_call_get_milliseconds(ctx, NULL); + + for (size_t i = 0; i < ctx->rst_info.GetCount(); i++) { + if ((ctx->rst_info[i].connid == id) && + (ctx->rst_info[i].addr == addr) && + (ctx->rst_info[i].ack_nr == seq_nr)) + { + ctx->rst_info[i].timestamp = ctx->current_ms; + + #if UTP_DEBUG_LOGGING + ctx->log(UTP_LOG_DEBUG, NULL, "recv not sending RST to non-SYN (stored)"); + #endif + + return 1; + } + } + + if (ctx->rst_info.GetCount() > RST_INFO_LIMIT) { + + #if UTP_DEBUG_LOGGING + ctx->log(UTP_LOG_DEBUG, NULL, "recv not sending RST to non-SYN (limit at %u stored)", (uint)ctx->rst_info.GetCount()); + #endif + + return 1; + } + + #if UTP_DEBUG_LOGGING + ctx->log(UTP_LOG_DEBUG, NULL, "recv send RST to non-SYN (%u stored)", (uint)ctx->rst_info.GetCount()); + #endif + + RST_Info &r = ctx->rst_info.Append(); + r.addr = addr; + r.connid = id; + r.ack_nr = seq_nr; + r.timestamp = ctx->current_ms; + + UTPSocket::send_rst(ctx, addr, id, seq_nr, utp_call_get_random(ctx, NULL)); + return 1; + } + + if (ctx->callbacks[UTP_ON_ACCEPT]) { + + #if UTP_DEBUG_LOGGING + ctx->log(UTP_LOG_DEBUG, NULL, "Incoming connection from %s", addrfmt(addr, addrbuf)); + #endif + + UTPSocketKeyData* keyData = ctx->utp_sockets->Lookup(UTPSocketKey(addr, id + 1)); + if (keyData) { + + #if UTP_DEBUG_LOGGING + ctx->log(UTP_LOG_DEBUG, NULL, "rejected incoming connection, connection already exists"); + #endif + + return 1; + } + + if (ctx->utp_sockets->GetCount() > 3000) { + + #if UTP_DEBUG_LOGGING + ctx->log(UTP_LOG_DEBUG, NULL, "rejected incoming connection, too many uTP sockets %d", ctx->utp_sockets->GetCount()); + #endif + + return 1; + } + // true means yes, block connection. false means no, don't block. 
+ if (utp_call_on_firewall(ctx, to, tolen)) { + + #if UTP_DEBUG_LOGGING + ctx->log(UTP_LOG_DEBUG, NULL, "rejected incoming connection, firewall callback returned true"); + #endif + + return 1; + } + + // Create a new UTP socket to handle this new connection + UTPSocket *conn = utp_create_socket(ctx); + utp_initialize_socket(conn, to, tolen, false, id, id+1, id); + conn->ack_nr = seq_nr; + conn->seq_nr = utp_call_get_random(ctx, NULL); + conn->fast_resend_seq_nr = conn->seq_nr; + conn->state = CS_SYN_RECV; + + const size_t read = utp_process_incoming(conn, buffer, len, true); + + #if UTP_DEBUG_LOGGING + ctx->log(UTP_LOG_DEBUG, NULL, "recv send connect ACK"); + #endif + + conn->send_ack(true); + + utp_call_on_accept(ctx, conn, to, tolen); + + // we report overhead after on_accept(), because the callbacks are setup now + utp_call_on_overhead_statistics(conn->ctx, conn, false, (len - read) + conn->get_udp_overhead(), header_overhead); // SYN + utp_call_on_overhead_statistics(conn->ctx, conn, true, conn->get_overhead(), ack_overhead); // SYNACK + } + else { + + #if UTP_DEBUG_LOGGING + ctx->log(UTP_LOG_DEBUG, NULL, "rejected incoming connection, UTP_ON_ACCEPT callback not set"); + #endif + + } + + return 1; +} + +// Called by utp_process_icmp_fragmentation() and utp_process_icmp_error() below +static UTPSocket* parse_icmp_payload(utp_context *ctx, const byte *buffer, size_t len, const struct sockaddr *to, socklen_t tolen) +{ + assert(ctx); + if (!ctx) return NULL; + + assert(buffer); + if (!buffer) return NULL; + + assert(to); + if (!to) return NULL; + + const PackedSockAddr addr((const SOCKADDR_STORAGE*)to, tolen); + + // ICMP packets are only required to quote the first 8 bytes of the layer4 + // payload. The UDP payload is 8 bytes, and the UTP header is another 20 + // bytes. So, in order to find the entire UTP header, we need the ICMP + // packet to quote 28 bytes. 
+ if (len < sizeof(PacketFormatV1)) { + #if UTP_DEBUG_LOGGING + ctx->log(UTP_LOG_DEBUG, NULL, "Ignoring ICMP from %s: runt length %d", addrfmt(addr, addrbuf), len); + #endif + return NULL; + } + + const PacketFormatV1 *pf = (PacketFormatV1*)buffer; + const byte version = UTP_Version(pf); + const uint32 id = uint32(pf->connid); + + if (version != 1) { + #if UTP_DEBUG_LOGGING + ctx->log(UTP_LOG_DEBUG, NULL, "Ignoring ICMP from %s: not UTP version 1", addrfmt(addr, addrbuf)); + #endif + return NULL; + } + + UTPSocketKeyData* keyData; + + if ( (keyData = ctx->utp_sockets->Lookup(UTPSocketKey(addr, id))) || + ((keyData = ctx->utp_sockets->Lookup(UTPSocketKey(addr, id + 1))) && keyData->socket->conn_id_send == id) || + ((keyData = ctx->utp_sockets->Lookup(UTPSocketKey(addr, id - 1))) && keyData->socket->conn_id_send == id)) + { + return keyData->socket; + } + + #if UTP_DEBUG_LOGGING + ctx->log(UTP_LOG_DEBUG, NULL, "Ignoring ICMP from %s: No matching connection found for id %u", addrfmt(addr, addrbuf), id); + #endif + return NULL; +} + +// Should be called when an ICMP Type 3, Code 4 packet (fragmentation needed) is received, to adjust the MTU +// +// Returns 1 if the UDP payload (delivered in the ICMP packet) was recognized as a UTP packet, or 0 if it was not +// +// @ctx: utp_context +// @buf: Contents of the original UDP payload, which the ICMP packet quoted. *Not* the ICMP packet itself. +// @len: buffer length +// @to: destination address of the original UDP pakcet +// @tolen: address length +// @next_hop_mtu: +int utp_process_icmp_fragmentation(utp_context *ctx, const byte* buffer, size_t len, const struct sockaddr *to, socklen_t tolen, uint16 next_hop_mtu) +{ + UTPSocket* conn = parse_icmp_payload(ctx, buffer, len, to, tolen); + if (!conn) return 0; + + // Constrain the next_hop_mtu to sane values. 
 It might not be initialized or sent properly
+ if (next_hop_mtu >= 576 && next_hop_mtu < 0x2000) {
+ conn->mtu_ceiling = min(next_hop_mtu, conn->mtu_ceiling);
+ conn->mtu_search_update();
+ // this is something of a special case, where we don't set mtu_last
+ // to the value in between the floor and the ceiling. We can update the
+ // floor, because there might be more network segments after the one
+ // that sent this ICMP with smaller MTUs. But we want to test this
+ // MTU size first. If the next probe gets through, mtu_floor is updated
+ conn->mtu_last = conn->mtu_ceiling;
+ } else {
+ // Otherwise, binary search. At this point we don't actually know
+ // what size the packet that failed was, and apparently we can't
+ // trust the next hop mtu either. It seems reasonably conservative
+ // to just lower the ceiling. This should not happen on working networks
+ // anyway.
+ conn->mtu_ceiling = (conn->mtu_floor + conn->mtu_ceiling) / 2;
+ conn->mtu_search_update();
+ }
+
+ conn->log(UTP_LOG_MTU, "MTU [ICMP] floor:%d ceiling:%d current:%d", conn->mtu_floor, conn->mtu_ceiling, conn->mtu_last);
+ return 1;
+}
+
+// Should be called when an ICMP message is received that should tear down the connection.
+//
+// Returns 1 if the UDP payload (delivered in the ICMP packet) was recognized as a UTP packet, or 0 if it was not
+//
+// @ctx: utp_context
+// @buf: Contents of the original UDP payload, which the ICMP packet quoted. *Not* the ICMP packet itself.
+// @len: buffer length
+// @to: destination address of the original UDP packet
+// @tolen: address length
+int utp_process_icmp_error(utp_context *ctx, const byte *buffer, size_t len, const struct sockaddr *to, socklen_t tolen)
+{
+ UTPSocket* conn = parse_icmp_payload(ctx, buffer, len, to, tolen);
+ if (!conn) return 0;
+
+ const int err = (conn->state == CS_SYN_SENT) ?
UTP_ECONNREFUSED : UTP_ECONNRESET; + const PackedSockAddr addr((const SOCKADDR_STORAGE*)to, tolen); + + switch(conn->state) { + // Don't pass on errors for idle/closed connections + case CS_IDLE: + #if UTP_DEBUG_LOGGING + ctx->log(UTP_LOG_DEBUG, NULL, "ICMP from %s in state CS_IDLE, ignoring", addrfmt(addr, addrbuf)); + #endif + return 1; + + default: + if (conn->close_requested) { + #if UTP_DEBUG_LOGGING + ctx->log(UTP_LOG_DEBUG, NULL, "ICMP from %s after close, setting state to CS_DESTROY and causing error %d", addrfmt(addr, addrbuf), err); + #endif + conn->state = CS_DESTROY; + } else { + #if UTP_DEBUG_LOGGING + ctx->log(UTP_LOG_DEBUG, NULL, "ICMP from %s, setting state to CS_RESET and causing error %d", addrfmt(addr, addrbuf), err); + #endif + conn->state = CS_RESET; + } + break; + } + + utp_call_on_error(conn->ctx, conn, err); + return 1; +} + +// Write bytes to the UTP socket. Returns the number of bytes written. +// 0 indicates the socket is no longer writable, -1 indicates an error +ssize_t utp_writev(utp_socket *conn, struct utp_iovec *iovec_input, size_t num_iovecs) +{ + static utp_iovec iovec[UTP_IOV_MAX]; + + assert(conn); + if (!conn) return -1; + + assert(iovec_input); + if (!iovec_input) return -1; + + assert(num_iovecs); + if (!num_iovecs) return -1; + + if (num_iovecs > UTP_IOV_MAX) + num_iovecs = UTP_IOV_MAX; + + memcpy(iovec, iovec_input, sizeof(struct utp_iovec)*num_iovecs); + + size_t bytes = 0; + size_t sent = 0; + for (size_t i = 0; i < num_iovecs; i++) + bytes += iovec[i].iov_len; + + #if UTP_DEBUG_LOGGING + size_t param = bytes; + #endif + + if (conn->state != CS_CONNECTED) { + #if UTP_DEBUG_LOGGING + conn->log(UTP_LOG_DEBUG, "UTP_Write %u bytes = false (not CS_CONNECTED)", (uint)bytes); + #endif + return 0; + } + + if (conn->fin_sent) { + #if UTP_DEBUG_LOGGING + conn->log(UTP_LOG_DEBUG, "UTP_Write %u bytes = false (fin_sent already)", (uint)bytes); + #endif + return 0; + } + + conn->ctx->current_ms = utp_call_get_milliseconds(conn->ctx, 
conn); + + // don't send unless it will all fit in the window + size_t packet_size = conn->get_packet_size(); + size_t num_to_send = min(bytes, packet_size); + while (!conn->is_full(num_to_send)) { + // Send an outgoing packet. + // Also add it to the outgoing of packets that have been sent but not ACKed. + + bytes -= num_to_send; + sent += num_to_send; + + #if UTP_DEBUG_LOGGING + conn->log(UTP_LOG_DEBUG, "Sending packet. seq_nr:%u ack_nr:%u wnd:%u/%u/%u rcv_win:%u size:%u cur_window_packets:%u", + conn->seq_nr, conn->ack_nr, + (uint)(conn->cur_window + num_to_send), + (uint)conn->max_window, (uint)conn->max_window_user, + (uint)conn->last_rcv_win, num_to_send, + conn->cur_window_packets); + #endif + conn->write_outgoing_packet(num_to_send, ST_DATA, iovec, num_iovecs); + num_to_send = min(bytes, packet_size); + + if (num_to_send == 0) { + #if UTP_DEBUG_LOGGING + conn->log(UTP_LOG_DEBUG, "UTP_Write %u bytes = true", (uint)param); + #endif + return sent; + } + } + + bool full = conn->is_full(); + if (full) { + // mark the socket as not being writable. + conn->state = CS_CONNECTED_FULL; + } + + #if UTP_DEBUG_LOGGING + conn->log(UTP_LOG_DEBUG, "UTP_Write %u bytes = %s", (uint)bytes, full ? 
"false" : "true"); + #endif + + // returns whether or not the socket is still writable + // if the congestion window is not full, we can still write to it + //return !full; + return sent; +} + +void utp_read_drained(utp_socket *conn) +{ + assert(conn); + if (!conn) return; + + assert(conn->state != CS_UNINITIALIZED); + if (conn->state == CS_UNINITIALIZED) return; + + const size_t rcvwin = conn->get_rcv_window(); + + if (rcvwin > conn->last_rcv_win) { + // If last window was 0 send ACK immediately, otherwise should set timer + if (conn->last_rcv_win == 0) { + conn->send_ack(); + } else { + conn->ctx->current_ms = utp_call_get_milliseconds(conn->ctx, conn); + conn->schedule_ack(); + } + } +} + +// Should be called each time the UDP socket is drained +void utp_issue_deferred_acks(utp_context *ctx) +{ + assert(ctx); + if (!ctx) return; + + for (size_t i = 0; i < ctx->ack_sockets.GetCount(); i++) { + UTPSocket *conn = ctx->ack_sockets[i]; + conn->send_ack(); + i--; + } +} + +// Should be called every 500ms +void utp_check_timeouts(utp_context *ctx) +{ + assert(ctx); + if (!ctx) return; + + ctx->current_ms = utp_call_get_milliseconds(ctx, NULL); + + if (ctx->current_ms - ctx->last_check < TIMEOUT_CHECK_INTERVAL) + return; + + ctx->last_check = ctx->current_ms; + + for (size_t i = 0; i < ctx->rst_info.GetCount(); i++) { + if ((int)(ctx->current_ms - ctx->rst_info[i].timestamp) >= RST_INFO_TIMEOUT) { + ctx->rst_info.MoveUpLast(i); + i--; + } + } + if (ctx->rst_info.GetCount() != ctx->rst_info.GetAlloc()) { + ctx->rst_info.Compact(); + } + + utp_hash_iterator_t it; + UTPSocketKeyData* keyData; + while ((keyData = ctx->utp_sockets->Iterate(it))) { + UTPSocket *conn = keyData->socket; + conn->check_timeouts(); + + // Check if the object was deleted + if (conn->state == CS_DESTROY) { + #if UTP_DEBUG_LOGGING + conn->log(UTP_LOG_DEBUG, "Destroying"); + #endif + delete conn; + } + } +} + +int utp_getpeername(utp_socket *conn, struct sockaddr *addr, socklen_t *addrlen) +{ + 
 assert(addr);
+ if (!addr) return -1;
+
+ assert(addrlen);
+ if (!addrlen) return -1;
+
+ assert(conn);
+ if (!conn) return -1;
+
+ assert(conn->state != CS_UNINITIALIZED);
+ if (conn->state == CS_UNINITIALIZED) return -1;
+
+ socklen_t len;
+ const SOCKADDR_STORAGE sa = conn->addr.get_sockaddr_storage(&len);
+ *addrlen = min(len, *addrlen);
+ memcpy(addr, &sa, *addrlen);
+ return 0;
+}
+
+int utp_get_delays(UTPSocket *conn, uint32 *ours, uint32 *theirs, uint32 *age)
+{
+ assert(conn);
+ if (!conn) return -1;
+
+ assert(conn->state != CS_UNINITIALIZED);
+ if (conn->state == CS_UNINITIALIZED) {
+ if (ours) *ours = 0;
+ if (theirs) *theirs = 0;
+ if (age) *age = 0;
+ return -1;
+ }
+
+ if (ours) *ours = conn->our_hist.get_value();
+ if (theirs) *theirs = conn->their_hist.get_value();
+ if (age) *age = (uint32)(conn->ctx->current_ms - conn->last_measured_delay);
+ return 0;
+}
+
+// Close the UTP socket.
+// It is not valid for the upper layer to refer to the socket after it is closed.
+// Data will still attempt to be delivered after the close.
+void utp_close(UTPSocket *conn) +{ + assert(conn); + if (!conn) return; + + assert(conn->state != CS_UNINITIALIZED + && conn->state != CS_DESTROY); + + #if UTP_DEBUG_LOGGING + conn->log(UTP_LOG_DEBUG, "UTP_Close in state:%s", statenames[conn->state]); + #endif + + switch(conn->state) { + case CS_CONNECTED: + case CS_CONNECTED_FULL: + conn->read_shutdown = true; + conn->close_requested = true; + if (!conn->fin_sent) { + conn->fin_sent = true; + conn->write_outgoing_packet(0, ST_FIN, NULL, 0); + } else if (conn->fin_sent_acked) { + conn->state = CS_DESTROY; + } + break; + + case CS_SYN_SENT: + conn->rto_timeout = utp_call_get_milliseconds(conn->ctx, conn) + min(conn->rto * 2, 60); + // fall through + case CS_SYN_RECV: + // fall through + default: + conn->state = CS_DESTROY; + break; + } + + #if UTP_DEBUG_LOGGING + conn->log(UTP_LOG_DEBUG, "UTP_Close end in state:%s", statenames[conn->state]); + #endif +} + +void utp_shutdown(UTPSocket *conn, int how) +{ + assert(conn); + if (!conn) return; + + assert(conn->state != CS_UNINITIALIZED + && conn->state != CS_DESTROY); + + #if UTP_DEBUG_LOGGING + conn->log(UTP_LOG_DEBUG, "UTP_shutdown(%d) in state:%s", how, statenames[conn->state]); + #endif + + if (how != SHUT_WR) { + conn->read_shutdown = true; + } + if (how != SHUT_RD) { + switch(conn->state) { + case CS_CONNECTED: + case CS_CONNECTED_FULL: + if (!conn->fin_sent) { + conn->fin_sent = true; + conn->write_outgoing_packet(0, ST_FIN, NULL, 0); + } + break; + case CS_SYN_SENT: + conn->rto_timeout = utp_call_get_milliseconds(conn->ctx, conn) + min(conn->rto * 2, 60); + default: + break; + } + } +} + +utp_context* utp_get_context(utp_socket *socket) { + assert(socket); + return socket ? socket->ctx : NULL; +} + +void* utp_set_userdata(utp_socket *socket, void *userdata) { + assert(socket); + if (socket) socket->userdata = userdata; + return socket ? socket->userdata : NULL; +} + +void* utp_get_userdata(utp_socket *socket) { + assert(socket); + return socket ? 
socket->userdata : NULL; +} + +void struct_utp_context::log(int level, utp_socket *socket, char const *fmt, ...) +{ + if (!would_log(level)) { + return; + } + + va_list va; + va_start(va, fmt); + log_unchecked(socket, fmt, va); + va_end(va); +} + +void struct_utp_context::log_unchecked(utp_socket *socket, char const *fmt, ...) +{ + va_list va; + char buf[4096]; + + va_start(va, fmt); + vsnprintf(buf, 4096, fmt, va); + buf[4095] = '\0'; + va_end(va); + + utp_call_log(this, socket, (const byte *)buf); +} + +inline bool struct_utp_context::would_log(int level) +{ + if (level == UTP_LOG_NORMAL) return log_normal; + if (level == UTP_LOG_MTU) return log_mtu; + if (level == UTP_LOG_DEBUG) return log_debug; + return true; +} + +utp_socket_stats* utp_get_stats(utp_socket *socket) +{ + #ifdef _DEBUG + assert(socket); + if (!socket) return NULL; + socket->_stats.mtu_guess = socket->mtu_last ? socket->mtu_last : socket->mtu_ceiling; + return &socket->_stats; + #else + return NULL; + #endif +} diff --git a/libutp/utp_internal.h b/libutp/utp_internal.h new file mode 100644 index 000000000..641814630 --- /dev/null +++ b/libutp/utp_internal.h @@ -0,0 +1,141 @@ +/* + * Copyright (c) 2010-2013 BitTorrent, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#ifndef __UTP_INTERNAL_H__ +#define __UTP_INTERNAL_H__ + +#include +#include +#include +#include + +#include "utp.h" +#include "utp_callbacks.h" +#include "utp_templates.h" +#include "utp_hash.h" +#include "utp_hash.h" +#include "utp_packedsockaddr.h" + +/* These originally lived in utp_config.h */ +#define CCONTROL_TARGET (100 * 1000) // us + +enum bandwidth_type_t { + payload_bandwidth, connect_overhead, + close_overhead, ack_overhead, + header_overhead, retransmit_overhead +}; + +#ifdef WIN32 + #ifdef _MSC_VER + #include "libutp_inet_ntop.h" + #endif + + // newer versions of MSVC define these in errno.h + #ifndef ECONNRESET + #define ECONNRESET WSAECONNRESET + #define EMSGSIZE WSAEMSGSIZE + #define ECONNREFUSED WSAECONNREFUSED + #define ETIMEDOUT WSAETIMEDOUT + #endif +#endif + +struct PACKED_ATTRIBUTE RST_Info { + PackedSockAddr addr; + uint32 connid; + uint16 ack_nr; + uint64 timestamp; +}; + +// It's really important that we don't have duplicate keys in the hash table. +// If we do, we'll eventually crash. if we try to remove the second instance +// of the key, we'll accidentally remove the first instead. then later, +// checkTimeouts will try to access the second one's already freed memory. 
+void UTP_FreeAll(struct UTPSocketHT *utp_sockets); + +struct UTPSocketKey { + PackedSockAddr addr; + uint32 recv_id; // "conn_seed", "conn_id" + + UTPSocketKey(const PackedSockAddr& _addr, uint32 _recv_id) { + memset(this, 0, sizeof(*this)); + addr = _addr; + recv_id = _recv_id; + } + + bool operator == (const UTPSocketKey &other) const { + return recv_id == other.recv_id && addr == other.addr; + } + + uint32 compute_hash() const { + return recv_id ^ addr.compute_hash(); + } +}; + +struct UTPSocketKeyData { + UTPSocketKey key; + UTPSocket *socket; + utp_link_t link; +}; + +#define UTP_SOCKET_BUCKETS 79 +#define UTP_SOCKET_INIT 15 + +struct UTPSocketHT : utpHashTable { + UTPSocketHT() { + const int buckets = UTP_SOCKET_BUCKETS; + const int initial = UTP_SOCKET_INIT; + this->Create(buckets, initial); + } + ~UTPSocketHT() { + UTP_FreeAll(this); + this->Free(); + } +}; + +struct struct_utp_context { + void *userdata; + utp_callback_t* callbacks[UTP_ARRAY_SIZE]; + + uint64 current_ms; + utp_context_stats context_stats; + UTPSocket *last_utp_socket; + Array ack_sockets; + Array rst_info; + UTPSocketHT *utp_sockets; + size_t target_delay; + size_t opt_sndbuf; + size_t opt_rcvbuf; + uint64 last_check; + + struct_utp_context(); + ~struct_utp_context(); + + void log(int level, utp_socket *socket, char const *fmt, ...); + void log_unchecked(utp_socket *socket, char const *fmt, ...); + bool would_log(int level); + + bool log_normal:1; // log normal events? + bool log_mtu:1; // log MTU related events? + bool log_debug:1; // log debugging events? (Must also compile with UTP_DEBUG_LOGGING defined) +}; + +#endif //__UTP_INTERNAL_H__ diff --git a/libutp/utp_packedsockaddr.cpp b/libutp/utp_packedsockaddr.cpp new file mode 100644 index 000000000..ab65ae56c --- /dev/null +++ b/libutp/utp_packedsockaddr.cpp @@ -0,0 +1,139 @@ +// vim:set ts=4 sw=4 ai: + +/* + * Copyright (c) 2010-2013 BitTorrent, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include +#include +#include + +#include "utp_types.h" +#include "utp_hash.h" +#include "utp_packedsockaddr.h" + +#include "libutp_inet_ntop.h" + +byte PackedSockAddr::get_family() const +{ + #if defined(__sh__) + return ((_sin6d[0] == 0) && (_sin6d[1] == 0) && (_sin6d[2] == htonl(0xffff)) != 0) ? + AF_INET : AF_INET6; + #else + return (IN6_IS_ADDR_V4MAPPED(&_in._in6addr) != 0) ? 
AF_INET : AF_INET6; + #endif // defined(__sh__) +} + +bool PackedSockAddr::operator==(const PackedSockAddr& rhs) const +{ + if (&rhs == this) + return true; + if (_port != rhs._port) + return false; + return memcmp(_sin6, rhs._sin6, sizeof(_sin6)) == 0; +} + +bool PackedSockAddr::operator!=(const PackedSockAddr& rhs) const +{ + return !(*this == rhs); +} + +uint32 PackedSockAddr::compute_hash() const { + return utp_hash_mem(&_in, sizeof(_in)) ^ _port; +} + +void PackedSockAddr::set(const SOCKADDR_STORAGE* sa, socklen_t len) +{ + if (sa->ss_family == AF_INET) { + assert(len >= sizeof(sockaddr_in)); + const sockaddr_in *sin = (sockaddr_in*)sa; + _sin6w[0] = 0; + _sin6w[1] = 0; + _sin6w[2] = 0; + _sin6w[3] = 0; + _sin6w[4] = 0; + _sin6w[5] = 0xffff; + _sin4 = sin->sin_addr.s_addr; + _port = ntohs(sin->sin_port); + } else { + assert(len >= sizeof(sockaddr_in6)); + const sockaddr_in6 *sin6 = (sockaddr_in6*)sa; + _in._in6addr = sin6->sin6_addr; + _port = ntohs(sin6->sin6_port); + } +} + +PackedSockAddr::PackedSockAddr(const SOCKADDR_STORAGE* sa, socklen_t len) +{ + set(sa, len); +} + +PackedSockAddr::PackedSockAddr(void) +{ + SOCKADDR_STORAGE sa; + socklen_t len = sizeof(SOCKADDR_STORAGE); + memset(&sa, 0, len); + sa.ss_family = AF_INET; + set(&sa, len); +} + +SOCKADDR_STORAGE PackedSockAddr::get_sockaddr_storage(socklen_t *len = NULL) const +{ + SOCKADDR_STORAGE sa; + const byte family = get_family(); + if (family == AF_INET) { + sockaddr_in *sin = (sockaddr_in*)&sa; + if (len) *len = sizeof(sockaddr_in); + memset(sin, 0, sizeof(sockaddr_in)); + sin->sin_family = family; + sin->sin_port = htons(_port); + sin->sin_addr.s_addr = _sin4; + } else { + sockaddr_in6 *sin6 = (sockaddr_in6*)&sa; + memset(sin6, 0, sizeof(sockaddr_in6)); + if (len) *len = sizeof(sockaddr_in6); + sin6->sin6_family = family; + sin6->sin6_addr = _in._in6addr; + sin6->sin6_port = htons(_port); + } + return sa; +} + +// #define addrfmt(x, s) x.fmt(s, sizeof(s)) +cstr PackedSockAddr::fmt(str s, size_t 
len) const +{ + memset(s, 0, len); + const byte family = get_family(); + str i; + if (family == AF_INET) { + INET_NTOP(family, (uint32*)&_sin4, s, len); + i = s; + while (*++i) {} + } else { + i = s; + *i++ = '['; + INET_NTOP(family, (in6_addr*)&_in._in6addr, i, len-1); + while (*++i) {} + *i++ = ']'; + } + snprintf(i, len - (i-s), ":%u", _port); + return s; +} diff --git a/libutp/utp_packedsockaddr.h b/libutp/utp_packedsockaddr.h new file mode 100644 index 000000000..76e8accaa --- /dev/null +++ b/libutp/utp_packedsockaddr.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2010-2013 BitTorrent, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#ifndef __UTP_PACKEDSOCKADDR_H__ +#define __UTP_PACKEDSOCKADDR_H__ + +#include "utp_types.h" + +struct PACKED_ATTRIBUTE PackedSockAddr { + // The values are always stored here in network byte order + union { + byte _in6[16]; // IPv6 + uint16 _in6w[8]; // IPv6, word based (for convenience) + uint32 _in6d[4]; // Dword access + in6_addr _in6addr; // For convenience + } _in; + + // Host byte order + uint16 _port; + + #define _sin4 _in._in6d[3] // IPv4 is stored where it goes if mapped + + #define _sin6 _in._in6 + #define _sin6w _in._in6w + #define _sin6d _in._in6d + + byte get_family() const; + bool operator==(const PackedSockAddr& rhs) const; + bool operator!=(const PackedSockAddr& rhs) const; + void set(const SOCKADDR_STORAGE* sa, socklen_t len); + + PackedSockAddr(const SOCKADDR_STORAGE* sa, socklen_t len); + PackedSockAddr(void); + + SOCKADDR_STORAGE get_sockaddr_storage(socklen_t *len) const; + cstr fmt(str s, size_t len) const; + + uint32 compute_hash() const; +} ALIGNED_ATTRIBUTE(4); + +#endif //__UTP_PACKEDSOCKADDR_H__ diff --git a/libutp/utp_templates.h b/libutp/utp_templates.h new file mode 100644 index 000000000..8f88f5c7c --- /dev/null +++ b/libutp/utp_templates.h @@ -0,0 +1,195 @@ +/* + * Copyright (c) 2010-2013 BitTorrent, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#ifndef __TEMPLATES_H__ +#define __TEMPLATES_H__ + +#include "utp_types.h" +#include + +#if defined(POSIX) +/* Allow over-writing FORCEINLINE from makefile because gcc 3.4.4 for buffalo + doesn't seem to support __attribute__((always_inline)) in -O0 build + (strangely, it works in -Os build) */ +#ifndef FORCEINLINE +// The always_inline attribute asks gcc to inline the function even if no optimization is being requested. +// This macro should be used exclusive-or with the inline directive (use one or the other but not both) +// since Microsoft uses __forceinline to also mean inline, +// and this code is following a Microsoft compatibility model. +// Just setting the attribute without also specifying the inline directive apparently won't inline the function, +// as evidenced by multiply-defined symbols found at link time. 
+#define FORCEINLINE inline __attribute__((always_inline)) +#endif +#endif + +// Utility templates +#undef min +#undef max + +template static inline T min(T a, T b) { if (a < b) return a; return b; } +template static inline T max(T a, T b) { if (a > b) return a; return b; } + +template static inline T min(T a, T b, T c) { return min(min(a,b),c); } +template static inline T max(T a, T b, T c) { return max(max(a,b),c); } +template static inline T clamp(T v, T mi, T ma) +{ + if (v > ma) v = ma; + if (v < mi) v = mi; + return v; +} + +#if (defined(__SVR4) && defined(__sun)) + #pragma pack(1) +#else + #pragma pack(push,1) +#endif + + +namespace aux +{ + FORCEINLINE uint16 host_to_network(uint16 i) { return htons(i); } + FORCEINLINE uint32 host_to_network(uint32 i) { return htonl(i); } + FORCEINLINE int32 host_to_network(int32 i) { return htonl(i); } + FORCEINLINE uint16 network_to_host(uint16 i) { return ntohs(i); } + FORCEINLINE uint32 network_to_host(uint32 i) { return ntohl(i); } + FORCEINLINE int32 network_to_host(int32 i) { return ntohl(i); } +} + +template +struct PACKED_ATTRIBUTE big_endian +{ + T operator=(T i) { m_integer = aux::host_to_network(i); return i; } + operator T() const { return aux::network_to_host(m_integer); } +private: + T m_integer; +}; + +typedef big_endian int32_big; +typedef big_endian uint32_big; +typedef big_endian uint16_big; + +#if (defined(__SVR4) && defined(__sun)) + #pragma pack(0) +#else + #pragma pack(pop) +#endif + +template static inline void zeromem(T *a, size_t count = 1) { memset(a, 0, count * sizeof(T)); } + +typedef int SortCompareProc(const void *, const void *); + +template static FORCEINLINE void QuickSortT(T *base, size_t num, int (*comp)(const T *, const T *)) { qsort(base, num, sizeof(T), (SortCompareProc*)comp); } + + +// WARNING: The template parameter MUST be a POD type! 
+template class Array { +protected: + T *mem; + size_t alloc,count; + +public: + Array(size_t init) { Init(init); } + Array() { Init(); } + ~Array() { Free(); } + + void inline Init() { mem = NULL; alloc = count = 0; } + void inline Init(size_t init) { Init(); if (init) Resize(init); } + size_t inline GetCount() const { return count; } + size_t inline GetAlloc() const { return alloc; } + void inline SetCount(size_t c) { count = c; } + + inline T& operator[](size_t offset) { assert(offset ==0 || offset(minsize, alloc * 2)); } + + inline size_t Append(const T &t) { + if (count >= alloc) Grow(); + size_t r=count++; + mem[r] = t; + return r; + } + + T inline &Append() { + if (count >= alloc) Grow(); + return mem[count++]; + } + + void inline Compact() { + Resize(count); + } + + void inline Free() { + free(mem); + Init(); + } + + void inline Clear() { + count = 0; + } + + bool inline MoveUpLast(size_t index) { + assert(index < count); + size_t c = --count; + if (index != c) { + mem[index] = mem[c]; + return true; + } + return false; + } + + bool inline MoveUpLastExist(const T &v) { + return MoveUpLast(LookupElementExist(v)); + } + + size_t inline LookupElement(const T &v) const { + for(size_t i = 0; i != count; i++) + if (mem[i] == v) + return i; + return (size_t) -1; + } + + bool inline HasElement(const T &v) const { + return LookupElement(v) != -1; + } + + typedef int SortCompareProc(const T *a, const T *b); + + void Sort(SortCompareProc* proc, size_t start, size_t end) { + QuickSortT(&mem[start], end - start, proc); + } + + void Sort(SortCompareProc* proc, size_t start) { + Sort(proc, start, count); + } + + void Sort(SortCompareProc* proc) { + Sort(proc, 0, count); + } +}; + +#endif //__TEMPLATES_H__ diff --git a/libutp/utp_utils.cpp b/libutp/utp_utils.cpp new file mode 100644 index 000000000..f2c57abe4 --- /dev/null +++ b/libutp/utp_utils.cpp @@ -0,0 +1,254 @@ +/* + * Copyright (c) 2010-2013 BitTorrent, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include +#include +#include "utp.h" +#include "utp_types.h" + +#ifdef WIN32 + #define WIN32_LEAN_AND_MEAN + #include + #include + #include +#else //!WIN32 + #include + #include // Linux needs both time.h and sys/time.h +#endif + +#if defined(__APPLE__) + #include +#endif + +#include "utp_utils.h" + +#ifdef WIN32 + +typedef ULONGLONG (WINAPI GetTickCount64Proc)(void); +static GetTickCount64Proc *pt2GetTickCount64; +static GetTickCount64Proc *pt2RealGetTickCount; + +static uint64 startPerformanceCounter; +static uint64 startGetTickCount; +// MSVC 6 standard doesn't like division with uint64s +static double counterPerMicrosecond; + +static uint64 UTGetTickCount64() +{ + if (pt2GetTickCount64) { + return pt2GetTickCount64(); + } + if (pt2RealGetTickCount) { + uint64 v = pt2RealGetTickCount(); + // fix return value from GetTickCount + return (DWORD)v | ((v >> 0x18) & 0xFFFFFFFF00000000); + } + return (uint64)GetTickCount(); +} + +static void Time_Initialize() +{ + HMODULE kernel32 = GetModuleHandleA("kernel32.dll"); + pt2GetTickCount64 = (GetTickCount64Proc*)GetProcAddress(kernel32, "GetTickCount64"); + // not a typo. GetTickCount actually returns 64 bits + pt2RealGetTickCount = (GetTickCount64Proc*)GetProcAddress(kernel32, "GetTickCount"); + + uint64 frequency; + QueryPerformanceCounter((LARGE_INTEGER*)&startPerformanceCounter); + QueryPerformanceFrequency((LARGE_INTEGER*)&frequency); + counterPerMicrosecond = (double)frequency / 1000000.0f; + startGetTickCount = UTGetTickCount64(); +} + +static int64 abs64(int64 x) { return x < 0 ? -x : x; } + +static uint64 __GetMicroseconds() +{ + static bool time_init = false; + if (!time_init) { + time_init = true; + Time_Initialize(); + } + + uint64 counter; + uint64 tick; + + QueryPerformanceCounter((LARGE_INTEGER*) &counter); + tick = UTGetTickCount64(); + + // unfortunately, QueryPerformanceCounter is not guaranteed + // to be monotonic. Make it so. 
+ int64 ret = (int64)(((int64)counter - (int64)startPerformanceCounter) / counterPerMicrosecond); + // if the QPC clock leaps more than one second off GetTickCount64() + // something is seriously fishy. Adjust QPC to stay monotonic + int64 tick_diff = tick - startGetTickCount; + if (abs64(ret / 100000 - tick_diff / 100) > 10) { + startPerformanceCounter -= (uint64)((int64)(tick_diff * 1000 - ret) * counterPerMicrosecond); + ret = (int64)((counter - startPerformanceCounter) / counterPerMicrosecond); + } + return ret; +} + +static inline uint64 UTP_GetMilliseconds() +{ + return GetTickCount(); +} + +#else //!WIN32 + +static inline uint64 UTP_GetMicroseconds(void); +static inline uint64 UTP_GetMilliseconds() +{ + return UTP_GetMicroseconds() / 1000; +} + +#if defined(__APPLE__) + +static uint64 __GetMicroseconds() +{ + // http://developer.apple.com/mac/library/qa/qa2004/qa1398.html + // http://www.macresearch.org/tutorial_performance_and_time + static mach_timebase_info_data_t sTimebaseInfo; + static uint64_t start_tick = 0; + uint64_t tick; + // Returns a counter in some fraction of a nanoseconds + tick = mach_absolute_time(); + if (sTimebaseInfo.denom == 0) { + // Get the timer ratio to convert mach_absolute_time to nanoseconds + mach_timebase_info(&sTimebaseInfo); + start_tick = tick; + } + // Calculate the elapsed time, convert it to microseconds and return it. + return ((tick - start_tick) * sTimebaseInfo.numer) / (sTimebaseInfo.denom * 1000); +} + +#else // !__APPLE__ + +#if ! (defined(_POSIX_TIMERS) && _POSIX_TIMERS > 0 && defined(CLOCK_MONOTONIC)) + #warning "Using non-monotonic function gettimeofday() in UTP_GetMicroseconds()" +#endif + +/* Unfortunately, #ifdef CLOCK_MONOTONIC is not enough to make sure that + POSIX clocks work -- we could be running a recent libc with an ancient + kernel (think OpenWRT). 
-- jch */ + +static uint64_t __GetMicroseconds() +{ + struct timeval tv; + + #if defined(_POSIX_TIMERS) && _POSIX_TIMERS > 0 && defined(CLOCK_MONOTONIC) + static int have_posix_clocks = -1; + int rc; + + if (have_posix_clocks < 0) { + struct timespec ts; + rc = clock_gettime(CLOCK_MONOTONIC, &ts); + if (rc < 0) { + have_posix_clocks = 0; + } else { + have_posix_clocks = 1; + } + } + + if (have_posix_clocks) { + struct timespec ts; + rc = clock_gettime(CLOCK_MONOTONIC, &ts); + return uint64(ts.tv_sec) * 1000000 + uint64(ts.tv_nsec) / 1000; + } + #endif + + gettimeofday(&tv, NULL); + return uint64(tv.tv_sec) * 1000000 + tv.tv_usec; +} + +#endif //!__APPLE__ + +#endif //!WIN32 + +/* + * Whew. Okay. After that #ifdef maze above, we now know we have a working + * __GetMicroseconds() implementation on all platforms. + * + * Because there are a number of assertions in libutp that will cause a crash + * if monotonic time isn't monotonic, now apply some safety checks. While in + * principle we're already protecting ourselves in cases where non-monotonic + * time is likely to happen, this protects all versions. + */ + +static inline uint64 UTP_GetMicroseconds() +{ + static uint64 offset = 0, previous = 0; + + uint64 now = __GetMicroseconds() + offset; + if (previous > now) { + /* Eek! 
*/ + offset += previous - now; + now = previous; + } + previous = now; + return now; +} + +#define ETHERNET_MTU 1500 +#define IPV4_HEADER_SIZE 20 +#define IPV6_HEADER_SIZE 40 +#define UDP_HEADER_SIZE 8 +#define GRE_HEADER_SIZE 24 +#define PPPOE_HEADER_SIZE 8 +#define MPPE_HEADER_SIZE 2 +// packets have been observed in the wild that were fragmented +// with a payload of 1416 for the first fragment +// There are reports of routers that have MTU sizes as small as 1392 +#define FUDGE_HEADER_SIZE 36 +#define TEREDO_MTU 1280 + +#define UDP_IPV4_OVERHEAD (IPV4_HEADER_SIZE + UDP_HEADER_SIZE) +#define UDP_IPV6_OVERHEAD (IPV6_HEADER_SIZE + UDP_HEADER_SIZE) +#define UDP_TEREDO_OVERHEAD (UDP_IPV4_OVERHEAD + UDP_IPV6_OVERHEAD) + +#define UDP_IPV4_MTU (ETHERNET_MTU - IPV4_HEADER_SIZE - UDP_HEADER_SIZE - GRE_HEADER_SIZE - PPPOE_HEADER_SIZE - MPPE_HEADER_SIZE - FUDGE_HEADER_SIZE) +#define UDP_IPV6_MTU (ETHERNET_MTU - IPV6_HEADER_SIZE - UDP_HEADER_SIZE - GRE_HEADER_SIZE - PPPOE_HEADER_SIZE - MPPE_HEADER_SIZE - FUDGE_HEADER_SIZE) +#define UDP_TEREDO_MTU (TEREDO_MTU - IPV6_HEADER_SIZE - UDP_HEADER_SIZE) + +uint64 utp_default_get_udp_mtu(utp_callback_arguments *args) { + // Since we don't know the local address of the interface, + // be conservative and assume all IPv6 connections are Teredo. + return (args->address->sa_family == AF_INET6) ? UDP_TEREDO_MTU : UDP_IPV4_MTU; +} + +uint64 utp_default_get_udp_overhead(utp_callback_arguments *args) { + // Since we don't know the local address of the interface, + // be conservative and assume all IPv6 connections are Teredo. + return (args->address->sa_family == AF_INET6) ? 
UDP_TEREDO_OVERHEAD : UDP_IPV4_OVERHEAD; +} + +uint64 utp_default_get_random(utp_callback_arguments *args) { + return rand(); +} + +uint64 utp_default_get_milliseconds(utp_callback_arguments *args) { + return UTP_GetMilliseconds(); +} + +uint64 utp_default_get_microseconds(utp_callback_arguments *args) { + return UTP_GetMicroseconds(); +} diff --git a/libutp/utp_utils.h b/libutp/utp_utils.h new file mode 100644 index 000000000..7eb0c5562 --- /dev/null +++ b/libutp/utp_utils.h @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2010-2013 BitTorrent, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +uint64 utp_default_get_udp_mtu(utp_callback_arguments *args); +uint64 utp_default_get_udp_overhead(utp_callback_arguments *args); +uint64 utp_default_get_random(utp_callback_arguments *args); +uint64 utp_default_get_milliseconds(utp_callback_arguments *args); +uint64 utp_default_get_microseconds(utp_callback_arguments *args); diff --git a/llarp/address_info.cpp b/llarp/address_info.cpp index f8cabdb42..699827c34 100644 --- a/llarp/address_info.cpp +++ b/llarp/address_info.cpp @@ -5,6 +5,7 @@ #include #include #include +#include namespace llarp { @@ -23,6 +24,20 @@ namespace llarp return *this; } + bool + AddressInfo::operator==(const AddressInfo &other) const + { + // we don't care about rank + return pubkey == other.pubkey && port == other.port + && dialect == other.dialect && ip == other.ip; + } + + bool + AddressInfo::operator<(const AddressInfo &other) const + { + return rank < other.rank || ip < other.ip || port < other.port; + } + bool AddressInfo::DecodeKey(llarp_buffer_t key, llarp_buffer_t *buf) { diff --git a/llarp/curvecp/client.cpp b/llarp/curvecp/client.cpp deleted file mode 100644 index e69de29bb..000000000 diff --git a/llarp/curvecp/impl.cpp b/llarp/curvecp/impl.cpp deleted file mode 100644 index 09a62395e..000000000 --- a/llarp/curvecp/impl.cpp +++ /dev/null @@ -1,32 +0,0 @@ -#include -#include "router.hpp" - -namespace llarp -{ - namespace curvecp - { - struct LinkLayer : public llarp::ILinkLayer - { - LinkLayer(llarp_router* r) : llarp::ILinkLayer(r) - { - } - - ~LinkLayer() - { - } - - const char* - Name() const - { - return "curvecp"; - } - }; - - std::unique_ptr< llarp::ILinkLayer > - NewServer(llarp_router* r) - { - return std::unique_ptr< llarp::ILinkLayer >(new LinkLayer(r)); - } - } // namespace curvecp - -} // namespace llarp diff --git a/llarp/curvecp/server.cpp b/llarp/link/curvecp.cpp similarity index 85% rename from llarp/curvecp/server.cpp rename to llarp/link/curvecp.cpp index 8634d3eff..5936a5394 100644 --- 
a/llarp/curvecp/server.cpp +++ b/llarp/link/curvecp.cpp @@ -1,5 +1,6 @@ #include #include "router.hpp" +#include namespace llarp { diff --git a/llarp/link/encoder.hpp b/llarp/link/encoder.hpp index 216983a11..74be89372 100644 --- a/llarp/link/encoder.hpp +++ b/llarp/link/encoder.hpp @@ -10,7 +10,8 @@ namespace llarp /// encode Link Introduce Message onto a buffer /// if router is nullptr then the LIM's r member is omitted. bool - EncodeLIM(llarp_buffer_t* buff, const RouterContact* router); + EncodeLIM(llarp_buffer_t* buff, const RouterContact* router, + const KeyExchangeNonce& n); } // namespace llarp #endif diff --git a/llarp/link/server.cpp b/llarp/link/server.cpp index 551d2aa11..800d0d55e 100644 --- a/llarp/link/server.cpp +++ b/llarp/link/server.cpp @@ -60,22 +60,9 @@ namespace llarp } } - void - ILinkLayer::RecvFrom(const Addr& from, const void* buf, size_t sz) - { - util::Lock l(m_SessionsMutex); - auto itr = m_Sessions.find(from); - if(itr == m_Sessions.end()) - m_Sessions - .insert(std::make_pair( - from, std::unique_ptr< ILinkSession >(NewInboundSession(from)))) - .first->second->Recv(buf, sz); - else - itr->second->Recv(buf, sz); - } - bool - ILinkLayer::PickAddress(const RouterContact& rc, llarp::Addr& picked) const + ILinkLayer::PickAddress(const RouterContact& rc, + llarp::AddressInfo& picked) const { std::string OurDialect = Name(); for(const auto& addr : rc.addrs) @@ -92,18 +79,18 @@ namespace llarp void ILinkLayer::TryEstablishTo(const RouterContact& rc) { - llarp::Addr to; + llarp::AddressInfo to; if(!PickAddress(rc, to)) return; util::Lock l(m_SessionsMutex); - auto itr = m_Sessions.find(to); + llarp::Addr addr(to); + auto itr = m_Sessions.find(addr); if(itr == m_Sessions.end()) m_Sessions .insert(std::make_pair( - to, std::unique_ptr< ILinkSession >(NewOutboundSession(rc)))) - .first->second->Handshake(); - else - itr->second->Handshake(); + addr, + std::unique_ptr< ILinkSession >(NewOutboundSession(rc, to)))) + .first->second->Start(); } bool 
diff --git a/llarp/link/utp.cpp b/llarp/link/utp.cpp new file mode 100644 index 000000000..968f0e75c --- /dev/null +++ b/llarp/link/utp.cpp @@ -0,0 +1,576 @@ +#include +#include "router.hpp" +#include +#include +#include +#include + +namespace llarp +{ + namespace utp + { + constexpr size_t FragmentBufferSize = 1088; + constexpr size_t FragmentHashSize = 32; + constexpr size_t FragmentNonceSize = 24; + constexpr size_t FragmentOverheadSize = + FragmentHashSize + FragmentNonceSize; + constexpr size_t FragmentBodySize = + FragmentBufferSize - FragmentOverheadSize; + +#ifdef __int128 + typedef unsigned __int128 Long_t; +#else + typedef uint64_t Long_t; +#endif + + typedef llarp::AlignedBuffer< FragmentBufferSize, false, Long_t > + FragmentBuffer; + typedef llarp::AlignedBuffer< MAX_LINK_MSG_SIZE, false, Long_t > + MessageBuffer; + + struct LinkLayer; + + struct BaseSession : public ILinkSession + { + utp_socket* sock; + llarp_router* router; + RouterContact remoteRC; + Addr remoteAddr; + SharedSecret sessionKey; + llarp_time_t lastActive; + llarp_time_t sessionTimeout = 10 * 1000; + std::queue< FragmentBuffer > sendq; + FragmentBuffer recvBuf; + size_t recvBufOffset = 0; + MessageBuffer recvMsg; + size_t recvMsgOffset = 0; + + enum State + { + eInitial, + eConnecting, + eLinkEstablished, // when utp connection is established + eCryptoHandshake, // crypto handshake initiated + eSessionReady, // session is ready + eClose // utp connection is closed + }; + + State state; + + virtual void + LinkEstablished() = 0; + + void + EnterState(State st) + { + state = st; + lastActive = llarp_time_now_ms(); + } + + BaseSession(llarp_router* r, utp_socket* s); + virtual ~BaseSession(); + + void + Pump() + { + // TODO: use utp_writev + while(sendq.size()) + { + auto& front = sendq.front(); + write_ll(front.data(), front.size()); + sendq.pop(); + } + } + + void + write_ll(void* buf, size_t sz) + { + llarp::LogDebug("utp_write ", sz, " bytes to ", remoteAddr); + ssize_t wrote = 
utp_write(sock, buf, sz); + if(wrote < 0) + { + llarp::LogWarn("utp_write returned ", wrote); + } + llarp::LogDebug("utp_write wrote ", wrote, " bytes to ", remoteAddr); + } + + bool + VerifyThenDecrypt(FragmentBuffer& buf) + { + ShortHash digest; + if(!router->crypto.hmac( + digest, + InitBuffer(buf.data() + FragmentHashSize, + FragmentBufferSize - FragmentHashSize), + sessionKey)) + { + llarp::LogError("keyed hash failed"); + return false; + } + if(digest != ShortHash(buf.data())) + { + llarp::LogError("Message Integrity Failed"); + return false; + } + AlignedBuffer< FragmentNonceSize > nonce(buf.data() + FragmentHashSize); + + auto body = InitBuffer(buf.data() + FragmentOverheadSize, + FragmentBufferSize - FragmentOverheadSize); + + router->crypto.xchacha20(body, sessionKey, nonce); + + uint32_t upper, lower; + if(!(llarp_buffer_read_uint32(&body, &upper) + && llarp_buffer_read_uint32(&body, &lower))) + return false; + bool fragmentEnd = upper == 0; + if(lower > recvMsgOffset + recvMsg.size()) + { + llarp::LogError("Fragment too big: ", lower, " bytes"); + return false; + } + byte_t* ptr = recvMsg.data() + recvMsgOffset; + memcpy(ptr, body.cur, lower); + recvMsgOffset += lower; + if(fragmentEnd) + { + // got a message + auto msgbuf = InitBuffer(recvMsg.data(), recvMsgOffset); + recvMsgOffset = 0; + return router->HandleRecvLinkMessageBuffer(this, msgbuf); + } + return true; + } + + void + EncryptThenHash(FragmentBuffer& buf, const byte_t* ptr, uint32_t sz, + bool isLastFragment) + + { + buf.Randomize(); + const byte_t* nonce = buf.data() + FragmentHashSize; + byte_t* body = buf.data() + FragmentOverheadSize; + byte_t* base = body; + if(isLastFragment) + htobe32buf(body, 0); + body += sizeof(uint32_t); + htobe32buf(body, sz); + body += sizeof(uint32_t); + memcpy(body, ptr, sz); + auto payload = InitBuffer(base, FragmentBodySize); + router->crypto.xchacha20(payload, sessionKey, nonce); + router->crypto.hmac(buf, payload, sessionKey); + } + + bool + 
SendMessageBuffer(llarp_buffer_t buf) + { + if(state != eSessionReady) + return false; + size_t sz = buf.sz; + while(sz) + { + uint32_t s = + std::min((FragmentBodySize - (llarp_randint() % 128)), sz); + sendq.emplace(); + EncryptThenHash(sendq.back(), buf.cur, s, ((sz - s) == 0)); + buf.cur += s; + sz -= s; + } + return true; + } + + void + DoKeyExchange(llarp_transport_dh_func dh, const KeyExchangeNonce& n, + const PubKey& other, const SecretKey& secret) + { + if(!dh(sessionKey, other, secret, n)) + { + llarp::LogError("key exchange with ", other, " failed"); + SendClose(); + return; + } + EnterState(eSessionReady); + } + + void + Tick(llarp_time_t now) + { + } + + bool + SendKeepAlive() + { + return true; + } + + void + SendClose() + { + if(state != eClose) + { + utp_set_userdata(sock, nullptr); + utp_close(sock); + } + EnterState(eClose); + sock = nullptr; + } + + bool + IsEstablished() const + { + return state == eSessionReady; + } + + bool + Recv(const void* buf, size_t sz) + { + const byte_t* ptr = (const byte_t*)buf; + llarp::LogDebug("utp read ", sz, " from ", remoteAddr); + while(sz + recvBufOffset > FragmentBufferSize) + { + memcpy(recvBuf.data() + recvBufOffset, ptr, FragmentBufferSize); + sz -= FragmentBufferSize; + ptr += FragmentBufferSize; + VerifyThenDecrypt(recvBuf); + recvBufOffset = 0; + } + memcpy(recvBuf.data() + recvBufOffset, ptr, sz); + if(sz + recvBufOffset <= FragmentBufferSize) + { + recvBufOffset = 0; + VerifyThenDecrypt(recvBuf); + } + else + recvBufOffset += sz; + return true; + } + + void + RecvHandshake(const void* buf, size_t sz, ILinkLayer* parent) + { + if((recvBuf.size() - recvBufOffset) < sz) + { + llarp::LogInfo("handshake too big"); + SendClose(); + return; + } + memcpy(recvBuf.data() + recvBufOffset, buf, sz); + recvBufOffset += sz; + + if(recvBufOffset > 8) + { + // process handshake header + uint8_t* ptr = recvBuf.data(); + uint32_t version = bufbe32toh(ptr); + if(version != LLARP_PROTO_VERSION) + { + 
llarp::LogWarn("protocol version missmatch ", version, + " != ", LLARP_PROTO_VERSION); + return; + } + ptr += sizeof(uint32_t); + uint32_t limsz = bufbe32toh(ptr); + ptr += sizeof(uint32_t); + if(((sizeof(uint32_t) * 2) + limsz) > sz) + { + // not enough data + // TODO: don't bail here, continue reading + SendClose(); + } + LinkIntroMessage msg(this); + auto mbuf = InitBuffer(ptr, limsz); + if(!msg.BDecode(&mbuf)) + { + llarp::LogError("malfromed LIM from ", remoteAddr); + return; + } + if(!msg.HandleMessage(router)) + { + llarp::LogError("failed to handle LIM from ", remoteAddr); + SendClose(); + return; + } + remoteRC = msg.rc; + DoKeyExchange(router->crypto.dh_server, msg.N, msg.rc.enckey, + parent->TransportSecretKey()); + } + } + + bool + TimedOut(llarp_time_t now) const + { + if(now < lastActive) + return false; + return lastActive - now > sessionTimeout; + } + + const PubKey& + GetPubKey() const + { + return remoteRC.pubkey; + } + + const Addr& + GetRemoteEndpoint() const + { + return remoteAddr; + } + + void + MarkEstablished(); + }; + + struct LinkLayer : public ILinkLayer + { + utp_context* _utp_ctx = nullptr; + + static uint64 + OnRead(utp_callback_arguments* arg) + { + BaseSession* self = + static_cast< BaseSession* >(utp_get_userdata(arg->socket)); + if(self) + { + if(self->IsEstablished()) + self->Recv(arg->buf, arg->len); + else + { + LinkLayer* parent = static_cast< LinkLayer* >( + utp_context_get_userdata(arg->context)); + self->RecvHandshake(arg->buf, arg->len, parent); + } + utp_read_drained(arg->socket); + } + else + { + llarp::LogWarn("utp_socket got data with no underlying session"); + } + return 0; + } + + static uint64 + SendTo(utp_callback_arguments* arg) + { + LinkLayer* l = + static_cast< LinkLayer* >(utp_context_get_userdata(arg->context)); + llarp_ev_udp_sendto(&l->m_udp, arg->address, arg->buf, arg->len); + return 0; + } + + static uint64 + OnConnect(utp_callback_arguments* arg) + { + BaseSession* session = + static_cast< BaseSession* 
>(utp_get_userdata(arg->socket)); + session->LinkEstablished(); + return 0; + } + + static uint64 + OnAccept(utp_callback_arguments*); + + LinkLayer(llarp_router* r) : ILinkLayer(r) + { + _utp_ctx = utp_init(2); + utp_context_set_userdata(_utp_ctx, this); + utp_set_callback(_utp_ctx, UTP_SENDTO, &LinkLayer::SendTo); + utp_set_callback(_utp_ctx, UTP_ON_ACCEPT, &LinkLayer::OnAccept); + utp_set_callback(_utp_ctx, UTP_ON_CONNECT, &LinkLayer::OnConnect); + utp_set_callback(_utp_ctx, UTP_ON_READ, &LinkLayer::OnRead); + } + + ~LinkLayer() + { + utp_destroy(_utp_ctx); + } + + uint16_t + Rank() const + { + return 1; + } + + void + RecvFrom(const Addr& from, const void* buf, size_t sz) + { + utp_process_udp(_utp_ctx, (const byte_t*)buf, sz, from, from.SockLen()); + } + + void + Pump() + { + utp_check_timeouts(_utp_ctx); + utp_issue_deferred_acks(_utp_ctx); + ILinkLayer::Pump(); + } + + void + Stop() + { + } + + bool + KeyGen(SecretKey& k) + { + m_router->crypto.encryption_keygen(k); + return true; + } + + ILinkSession* + NewOutboundSession(const RouterContact& rc, const AddressInfo& addr); + + ILinkSession* + NewInboundSession(const Addr& addr); + + utp_socket* + NewSocket() + { + return utp_create_socket(_utp_ctx); + } + + const char* + Name() const + { + return "utp"; + } + }; + + std::unique_ptr< ILinkLayer > + NewServer(llarp_router* r) + { + return std::unique_ptr< ILinkLayer >(new LinkLayer(r)); + } + + struct OutboundSession : public BaseSession + { + PubKey remoteTransportPubKey; + + OutboundSession(llarp_router* r, utp_socket* s, const RouterContact& rc, + const AddressInfo& addr) + : BaseSession(r, s) + { + remoteRC = rc; + remoteAddr = addr; + remoteTransportPubKey = addr.pubkey; + } + + void + LinkEstablished() + { + llarp::LogDebug("link established with ", remoteAddr); + EnterState(eLinkEstablished); + KeyExchangeNonce nonce; + nonce.Randomize(); + SendHandshake(nonce); + EnterState(eCryptoHandshake); + DoKeyExchange(router->crypto.dh_client, nonce, 
remoteTransportPubKey, + router->encryption); + } + + // send our RC to the remote + void + SendHandshake(const KeyExchangeNonce& n) + { + byte_t tmp[MAX_RC_SIZE + 128] = {0}; + auto buf = StackBuffer< decltype(tmp) >(tmp); + // fastforward buffer for handshake to fit before + buf.cur += sizeof(uint32_t) * 2; + + LinkIntroMessage msg(this); + msg.rc = router->rc; + msg.N = n; + if(!msg.BEncode(&buf)) + return; + + uint32_t sz = buf.cur - buf.base; + sz -= sizeof(uint32_t) * 2; + // write handshake header + buf.cur = buf.base; + llarp_buffer_put_uint32(&buf, LLARP_PROTO_VERSION); + llarp_buffer_put_uint32(&buf, sz); + // send it + write_ll(buf.base, sz); + } + + void + Start() + { + utp_connect(sock, remoteAddr, remoteAddr.SockLen()); + EnterState(eConnecting); + } + }; + + struct InboundSession : public BaseSession + { + InboundSession(llarp_router* r, utp_socket* s, const Addr& addr) + : BaseSession(r, s) + { + remoteAddr = addr; + } + + void + Start() + { + } + + void + LinkEstablished() + { + EnterState(eLinkEstablished); + } + }; + + BaseSession::BaseSession(llarp_router* r, utp_socket* s) + { + router = r; + sock = s; + utp_set_userdata(sock, this); + lastActive = llarp_time_now_ms(); + } + + BaseSession::~BaseSession() + { + } + + ILinkSession* + LinkLayer::NewOutboundSession(const RouterContact& rc, + const AddressInfo& addr) + { + return new OutboundSession(m_router, utp_create_socket(_utp_ctx), rc, + addr); + } + + ILinkSession* + LinkLayer::NewInboundSession(const Addr& addr) + { + return nullptr; + } + + uint64 + LinkLayer::OnAccept(utp_callback_arguments* arg) + { + LinkLayer* self = + static_cast< LinkLayer* >(utp_context_get_userdata(arg->context)); + Addr remote(*arg->address); + llarp::LogDebug("utp accepted from ", remote); + if(self->HasSessionVia(remote)) + { + // TODO should we do this? 
+ llarp::LogWarn( + "utp socket closed because we already have a session " + "via ", + remote); + utp_close(arg->socket); + return 0; + } + InboundSession* session = + new InboundSession(self->m_router, arg->socket, remote); + self->PutSession(remote, session); + session->LinkEstablished(); + return 0; + } + + } // namespace utp + +} // namespace llarp diff --git a/llarp/link_intro.cpp b/llarp/link_intro.cpp index b73eb877a..45f665d3e 100644 --- a/llarp/link_intro.cpp +++ b/llarp/link_intro.cpp @@ -48,9 +48,14 @@ namespace llarp if(!bencode_write_bytestring(buf, "i", 1)) return false; + if(!bencode_write_bytestring(buf, "n", 1)) + return false; + if(!N.BEncode(buf)) + return false; + if(!bencode_write_bytestring(buf, "r", 1)) return false; - if(rc.BEncode(buf)) + if(!rc.BEncode(buf)) return false; if(!bencode_write_version_entry(buf)) @@ -62,6 +67,8 @@ namespace llarp bool LinkIntroMessage::HandleMessage(llarp_router* router) const { + if(!rc.VerifySignature(&router->crypto)) + return false; router->async_verify_RC(rc, !rc.IsPublicRouter()); return true; } diff --git a/llarp/net.cpp b/llarp/net.cpp index 33d453d60..3550074aa 100644 --- a/llarp/net.cpp +++ b/llarp/net.cpp @@ -14,19 +14,17 @@ bool operator==(const sockaddr& a, const sockaddr& b) { - socklen_t sz = sizeof(a.sa_data); + if(a.sa_family != b.sa_family) + return false; switch(a.sa_family) { case AF_INET: - sz = sizeof(sockaddr_in); - break; + return *((const sockaddr_in*)&a) == *((const sockaddr_in*)&b); case AF_INET6: - sz = sizeof(sockaddr_in6); - break; + return *((const sockaddr_in6*)&a) == *((const sockaddr_in6*)&b); default: - break; + return false; } - return a.sa_family == b.sa_family && memcmp(a.sa_data, b.sa_data, sz) == 0; } bool @@ -41,6 +39,24 @@ operator<(const in6_addr& a, const in6_addr& b) return memcmp(&a, &b, sizeof(in6_addr)) < 0; } +bool +operator==(const in6_addr& a, const in6_addr& b) +{ + return memcmp(&a, &b, sizeof(in6_addr)) == 0; +} + +bool +operator==(const sockaddr_in& a, 
const sockaddr_in& b) +{ + return a.sin_port == b.sin_port && a.sin_addr.s_addr == b.sin_addr.s_addr; +} + +bool +operator==(const sockaddr_in6& a, const sockaddr_in6& b) +{ + return a.sin6_port == b.sin6_port && a.sin6_addr == b.sin6_addr; +} + #ifdef _WIN32 #include #include