/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */

#ifndef _RDSV3_IB_H
#define	_RDSV3_IB_H

#include <sys/rds.h>

#include <sys/ib/clients/rdsv3/rdsv3.h>
#include <sys/ib/clients/rdsv3/rdma_transport.h>
#include <sys/ib/clients/rdsv3/rdsv3_af_thr.h>

/* FMR (fast memory registration) pool sizing */
#define	RDSV3_FMR_SIZE		256
#define	RDSV3_FMR_POOL_SIZE	(12 * 1024)

/* number of send work requests allocated per connection batch */
#define	RDSV3_IB_SEND_WRS	64

/* scatter/gather limits: 8 SGEs for sends, 2 (header + data) for receives */
#define	RDSV3_IB_MAX_SGE	8
#define	RDSV3_IB_RECV_SGE	2

/* default receive/send work-request queue depths */
#define	RDSV3_IB_DEFAULT_RECV_WR	1024
#define	RDSV3_IB_DEFAULT_SEND_WR	256

#define	RDSV3_IB_DEFAULT_RETRY_COUNT	2

/*
 * Minor versions supported, as a bitmask: bit N set means minor
 * protocol version N is accepted (see dp_protocol_minor_mask below).
 */
#define	RDSV3_IB_SUPPORTED_PROTOCOLS	0x00000003

/* global list of all rdsv3_ib_device instances known to the driver */
extern struct list rdsv3_ib_devices;

/*
 * IB posts RDSV3_FRAG_SIZE fragments of pages to the receive queues to
 * try and minimize the amount of memory tied up both the device and
 * socket receive queues.
 */
/* page offset of the final full frag that fits in the page */
#define	RDSV3_PAGE_LAST_OFF \
	(((PAGE_SIZE / RDSV3_FRAG_SIZE) - 1) * RDSV3_FRAG_SIZE)

/*
 * One receive-buffer fragment.  Fragments are chained on the
 * ii_frags list of a rdsv3_ib_incoming while a message is assembled.
 */
struct rdsv3_page_frag {
	struct list_node f_item;	/* linkage on an ii_frags list */
	caddr_t f_page;			/* backing buffer */
	unsigned long f_offset;		/* byte offset of this frag in f_page */
	ibt_wr_ds_t f_sge;		/* SGE describing the frag for the HCA */
	ibt_mi_hdl_t f_mapped;		/* IBT memory-map handle for f_page */
};

/*
 * An incoming message being reassembled from receive-queue fragments.
 * Embeds the transport-independent rdsv3_incoming (ii_inc).
 */
struct rdsv3_ib_incoming {
	list_node_t ii_obj;	/* list obj of rdsv3_inc_pool list */
	struct list ii_frags;	/* rdsv3_page_frag list making up the message */
	struct rdsv3_incoming ii_inc;	/* generic RDS incoming message */
	struct rdsv3_inc_pool *ii_pool;	/* pool this incoming came from */
	struct rdsv3_ib_device *ii_ibdev;	/* owning IB device */
};

/*
 * Private data exchanged in the CM connect request/response.
 * This is wire format shared with the peer.
 */
struct rdsv3_ib_connect_private {
	/* Add new fields at the end, and don't permute existing fields. */
	uint32_be_t dp_saddr;		/* source IPv4 address (BE) */
	uint32_be_t dp_daddr;		/* destination IPv4 address (BE) */
	uint8_t dp_protocol_major;
	uint8_t dp_protocol_minor;
	uint16_be_t dp_protocol_minor_mask; /* bitmask */
	uint32_be_t dp_reserved1;
	uint32_be_t dp_ack_seq;		/* last sequence ACKed */
	uint32_be_t dp_credit;		/* non-zero enables flow ctl */
};

/* Per-send-WR bookkeeping; one entry per slot of the send ring. */
struct rdsv3_ib_send_work {
	struct rdsv3_message *s_rm;	/* message this WR belongs to */
	struct rdsv3_rdma_op *s_op;	/* RDMA op, if any, for this WR */
	ibt_wrc_opcode_t s_opcode;	/* opcode expected on completion */
	unsigned long s_queued;		/* when the WR was queued */
};

/* Per-recv-WR bookkeeping; one entry per slot of the recv ring. */
struct rdsv3_ib_recv_work {
	struct rdsv3_ib_incoming *r_ibinc;	/* message being assembled */
	struct rdsv3_page_frag *r_frag;		/* fragment posted in this WR */
	ibt_wr_ds_t r_sge[2];		/* header + data SGEs (RDSV3_IB_RECV_SGE) */
};

/*
 * Circular work ring tracking allocation/free of WR slots.
 * Producers advance the alloc side; completions advance the free side.
 */
struct rdsv3_ib_work_ring {
	uint32_t w_nr;		/* total number of slots */
	uint32_t w_alloc_ptr;	/* next slot to hand out */
	uint32_t w_alloc_ctr;	/* count of outstanding allocations */
	uint32_t w_free_ptr;	/* oldest not-yet-freed slot */
	atomic_t w_free_ctr;	/* count of freed slots (completion side) */
	rdsv3_wait_queue_t w_empty_wait;	/* waiters for ring to drain */
};

/*
 * Rings are posted with all the allocations they'll need to queue the
 * incoming message to the receiving socket so this can't fail.
 * All fragments start with a header, so we can make sure we're not receiving
 * garbage, and we can tell a small 8 byte fragment from an ACK frame.
 */
/*
 * ACK state accumulated while draining a receive CQ; consumed after
 * the batch to decide whether/what to ACK.
 */
struct rdsv3_ib_ack_state {
	uint64_t ack_next;		/* next sequence to ACK */
	uint64_t ack_recv;		/* last ACK received from the peer */
	unsigned int ack_required:1;	/* peer requested an ACK */
	unsigned int ack_next_valid:1;	/* ack_next holds a valid value */
	unsigned int ack_recv_valid:1;	/* ack_recv holds a valid value */
};

struct rdsv3_ib_device;

/* Per-connection IB transport state, hung off rdsv3_connection. */
struct rdsv3_ib_connection {

	struct list_node ib_node;	/* linkage on the device's conn_list */
	boolean_t i_on_dev_list;	/* B_TRUE once linked onto a device */
	struct rdsv3_ib_device *rds_ibdev;	/* device we are bound to */
	struct rdsv3_connection *conn;		/* generic RDS connection */

	/* alphabet soup, IBTA style */
	struct rdma_cm_id *i_cm_id;	/* RDMA CM identifier */
	struct ib_pd *i_pd;		/* protection domain */
	struct rdsv3_hdrs_mr *i_mr;	/* MR covering the header arrays */
	struct ib_cq *i_cq;		/* receive completion queue */
	struct ib_cq *i_snd_cq;		/* send completion queue */

	/* tx */
	struct rdsv3_ib_work_ring i_send_ring;	/* send WR slot ring */
	struct rdsv3_message *i_rm;	/* message currently being sent */
	struct rdsv3_header *i_send_hdrs;	/* array of send headers */
	uint64_t i_send_hdrs_dma;	/* DMA address of i_send_hdrs */
	struct rdsv3_ib_send_work *i_sends;	/* per-slot send bookkeeping */
	ibt_send_wr_t *i_send_wrs;	/* preallocated send WRs */

	/* soft CQ */
	rdsv3_af_thr_t *i_soft_cq;	/* thread draining the recv CQ */
	rdsv3_af_thr_t *i_snd_soft_cq;	/* thread draining the send CQ */
	rdsv3_af_thr_t *i_refill_rq;	/* thread refilling the recv queue */

	/* rx */
	struct mutex i_recv_mutex;	/* serializes receive-side refill */
	struct rdsv3_ib_work_ring i_recv_ring;	/* recv WR slot ring */
	struct rdsv3_ib_incoming *i_ibinc;	/* message being reassembled */
	uint32_t i_recv_data_rem;	/* bytes still expected for i_ibinc */
	struct rdsv3_header *i_recv_hdrs;	/* array of recv headers */
	uint64_t i_recv_hdrs_dma;	/* DMA address of i_recv_hdrs */
	struct rdsv3_ib_recv_work *i_recvs;	/* per-slot recv bookkeeping */
	ibt_recv_wr_t *i_recv_wrs;	/* preallocated recv WRs */
	struct rdsv3_page_frag i_frag;	/* current fragment being filled */
	uint64_t i_ack_recv;	/* last ACK received */

	/* sending acks */
	unsigned long i_ack_flags;	/* IB_ACK_IN_FLIGHT/IB_ACK_REQUESTED */
#ifdef KERNEL_HAS_ATOMIC64
	atomic64_t i_ack_next;	/* next ACK to send */
#else
	kmutex_t i_ack_lock;	/* protect i_ack_next */
	uint64_t i_ack_next;	/* next ACK to send */
#endif
	struct rdsv3_header *i_ack;	/* dedicated header for ACK frames */
	ibt_send_wr_t i_ack_wr;		/* dedicated WR for ACK sends */
	ibt_wr_ds_t i_ack_sge;		/* SGE for the ACK header */
	uint64_t i_ack_dma;		/* DMA address of i_ack */
	unsigned long i_ack_queued;	/* when the in-flight ACK was queued */

	/*
	 * Flow control related information
	 *
	 * Our algorithm uses a pair of variables that we need to access
	 * atomically - one for the send credits, and one posted
	 * recv credits we need to transfer to remote.
185 * Rather than protect them using a slow spinlock, we put both into 186 * a single atomic_t and update it using cmpxchg 187 */ 188 atomic_t i_credits; 189 190 /* Protocol version specific information */ 191 unsigned int i_flowctl:1; /* enable/disable flow ctl */ 192 193 /* Batched completions */ 194 unsigned int i_unsignaled_wrs; 195 long i_unsignaled_bytes; 196 }; 197 198 /* This assumes that atomic_t is at least 32 bits */ 199 #define IB_GET_SEND_CREDITS(v) ((v) & 0xffff) 200 #define IB_GET_POST_CREDITS(v) ((v) >> 16) 201 #define IB_SET_SEND_CREDITS(v) ((v) & 0xffff) 202 #define IB_SET_POST_CREDITS(v) ((v) << 16) 203 204 struct rdsv3_ib_ipaddr { 205 struct list_node list; 206 uint32_be_t ipaddr; 207 }; 208 209 struct rdsv3_ib_device { 210 struct list_node list; 211 struct list ipaddr_list; 212 struct list conn_list; 213 ib_device_t *dev; 214 struct ib_pd *pd; 215 struct kmem_cache *ib_frag_slab; 216 kmutex_t spinlock; /* protect the above */ 217 krwlock_t rwlock; /* protect paddr_list */ 218 unsigned int fmr_max_remaps; 219 unsigned int max_fmrs; 220 unsigned int fmr_message_size; 221 int max_sge; 222 unsigned int max_wrs; 223 unsigned int max_initiator_depth; 224 unsigned int max_responder_resources; 225 struct rdsv3_fmr_pool *fmr_pool; 226 struct rdsv3_inc_pool *inc_pool; 227 ibt_fmr_pool_hdl_t fmr_pool_hdl; 228 ibt_hca_attr_t hca_attr; 229 rdsv3_af_thr_t *fmr_soft_cq; 230 rdsv3_af_thr_t *inc_soft_cq; 231 ibt_hca_hdl_t ibt_hca_hdl; 232 rdsv3_af_grp_t *aft_hcagp; 233 }; 234 235 /* bits for i_ack_flags */ 236 #define IB_ACK_IN_FLIGHT 0 237 #define IB_ACK_REQUESTED 1 238 239 #define RDSV3_IB_SEND_OP (1ULL << 63) 240 241 /* Magic WR_ID for ACKs */ 242 #define RDSV3_IB_ACK_WR_ID (~(uint64_t)0) 243 244 struct rdsv3_ib_statistics { 245 uint64_t s_ib_connect_raced; 246 uint64_t s_ib_listen_closed_stale; 247 uint64_t s_ib_evt_handler_call; 248 uint64_t s_ib_tasklet_call; 249 uint64_t s_ib_tx_cq_event; 250 uint64_t s_ib_tx_ring_full; 251 uint64_t s_ib_tx_throttle; 252 
	uint64_t s_ib_tx_sg_mapping_failure;
	uint64_t s_ib_tx_stalled;
	uint64_t s_ib_tx_credit_updates;
	uint64_t s_ib_rx_cq_event;
	uint64_t s_ib_rx_ring_empty;
	uint64_t s_ib_rx_refill_from_cq;
	uint64_t s_ib_rx_refill_from_thread;
	uint64_t s_ib_rx_alloc_limit;
	uint64_t s_ib_rx_credit_updates;
	uint64_t s_ib_ack_sent;
	uint64_t s_ib_ack_send_failure;
	uint64_t s_ib_ack_send_delayed;
	uint64_t s_ib_ack_send_piggybacked;
	uint64_t s_ib_ack_received;
	uint64_t s_ib_rdma_mr_alloc;
	uint64_t s_ib_rdma_mr_free;
	uint64_t s_ib_rdma_mr_used;
	uint64_t s_ib_rdma_mr_pool_flush;
	uint64_t s_ib_rdma_mr_pool_wait;
	uint64_t s_ib_rdma_mr_pool_depleted;
};

/* workqueue shared by the IB transport */
extern struct rdsv3_workqueue_struct_s *rds_ib_wq;

/* ib.c */
extern struct rdsv3_transport rdsv3_ib_transport;
extern void rdsv3_ib_add_one(ib_device_t *device);
extern void rdsv3_ib_remove_one(ib_device_t *device);
extern struct ib_client rdsv3_ib_client;

/* module tunables */
extern unsigned int fmr_pool_size;
extern unsigned int fmr_message_size;
extern unsigned int rdsv3_ib_retry_count;

/* connections not yet bound to a specific device, and their lock */
extern kmutex_t ib_nodev_conns_lock;
extern struct list ib_nodev_conns;

/* ib_cm.c */
int rdsv3_ib_conn_alloc(struct rdsv3_connection *conn, int gfp);
void rdsv3_ib_conn_free(void *arg);
int rdsv3_ib_conn_connect(struct rdsv3_connection *conn);
void rdsv3_ib_conn_shutdown(struct rdsv3_connection *conn);
void rdsv3_conn_drop(struct rdsv3_connection *conn);
int rdsv3_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
    struct rdma_cm_event *event);
int rdsv3_ib_cm_initiate_connect(struct rdma_cm_id *cm_id);
void rdsv3_ib_cm_connect_complete(struct rdsv3_connection *conn,
    struct rdma_cm_event *event);
void rdsv3_ib_tasklet_fn(void *data);
void rdsv3_ib_snd_tasklet_fn(void *data);
void rdsv3_ib_refill_fn(void *data);

/* ib_rdma.c */
int rdsv3_ib_update_ipaddr(struct rdsv3_ib_device
    *rds_ibdev, uint32_be_t ipaddr);
void rdsv3_ib_add_conn(struct rdsv3_ib_device *rds_ibdev,
    struct rdsv3_connection *conn);
void rdsv3_ib_remove_conn(struct rdsv3_ib_device *rds_ibdev,
    struct rdsv3_connection *conn);
void __rdsv3_ib_destroy_conns(struct list *list, kmutex_t *list_lock);

/* Tear down every connection that is not yet bound to a device. */
static inline void rdsv3_ib_destroy_nodev_conns(void)
{
	__rdsv3_ib_destroy_conns(&ib_nodev_conns, &ib_nodev_conns_lock);
}

/* Tear down every connection bound to the given device. */
static inline void rdsv3_ib_destroy_conns(struct rdsv3_ib_device *rds_ibdev)
{
	__rdsv3_ib_destroy_conns(&rds_ibdev->conn_list, &rds_ibdev->spinlock);
}

int rdsv3_ib_create_mr_pool(struct rdsv3_ib_device *);
void rdsv3_ib_destroy_mr_pool(struct rdsv3_ib_device *);
void rdsv3_ib_get_mr_info(struct rdsv3_ib_device *rds_ibdev,
    struct rdsv3_info_rdma_connection *iinfo);
void *rdsv3_ib_get_mr(struct rdsv3_iovec *args, unsigned long nents,
    struct rdsv3_sock *rs, uint32_t *key_ret);
void rdsv3_ib_sync_mr(void *trans_private, int dir);
void rdsv3_ib_free_mr(void *trans_private, int invalidate);
void rdsv3_ib_flush_mrs(void);
void rdsv3_ib_drain_mrlist_fn(void *data);

/* ib_recv.c */
int rdsv3_ib_recv_init(void);
void rdsv3_ib_recv_exit(void);
int rdsv3_ib_recv(struct rdsv3_connection *conn);
int rdsv3_ib_recv_refill(struct rdsv3_connection *conn, int prefill);
void rdsv3_ib_inc_free(struct rdsv3_incoming *inc);
int rdsv3_ib_inc_copy_to_user(struct rdsv3_incoming *inc, uio_t *uiop,
    size_t size);
void rdsv3_ib_recv_cqe_handler(struct rdsv3_ib_connection *ic, ibt_wc_t *wc,
    struct rdsv3_ib_ack_state *state);
void rdsv3_ib_recv_init_ring(struct rdsv3_ib_connection *ic);
void rdsv3_ib_recv_clear_ring(struct rdsv3_ib_connection *ic);
void rdsv3_ib_recv_init_ack(struct rdsv3_ib_connection *ic);
void rdsv3_ib_attempt_ack(struct rdsv3_ib_connection *ic);
void rdsv3_ib_ack_send_complete(struct rdsv3_ib_connection *ic);
uint64_t rdsv3_ib_piggyb_ack(struct rdsv3_ib_connection *ic);
void rdsv3_ib_set_ack(struct rdsv3_ib_connection *ic, uint64_t seq,
    int ack_required);
int rdsv3_ib_create_inc_pool(struct rdsv3_ib_device *);
void rdsv3_ib_destroy_inc_pool(struct rdsv3_ib_device *);
void rdsv3_ib_drain_inclist(void *);

/* ib_ring.c */
void rdsv3_ib_ring_init(struct rdsv3_ib_work_ring *ring, uint32_t nr);
void rdsv3_ib_ring_resize(struct rdsv3_ib_work_ring *ring, uint32_t nr);
uint32_t rdsv3_ib_ring_alloc(struct rdsv3_ib_work_ring *ring, uint32_t val,
    uint32_t *pos);
void rdsv3_ib_ring_free(struct rdsv3_ib_work_ring *ring, uint32_t val);
void rdsv3_ib_ring_unalloc(struct rdsv3_ib_work_ring *ring, uint32_t val);
int rdsv3_ib_ring_empty(struct rdsv3_ib_work_ring *ring);
int rdsv3_ib_ring_low(struct rdsv3_ib_work_ring *ring);
uint32_t rdsv3_ib_ring_oldest(struct rdsv3_ib_work_ring *ring);
uint32_t rdsv3_ib_ring_completed(struct rdsv3_ib_work_ring *ring,
    uint32_t wr_id, uint32_t oldest);

/* ib_send.c */
void rdsv3_ib_xmit_complete(struct rdsv3_connection *conn);
int rdsv3_ib_xmit(struct rdsv3_connection *conn, struct rdsv3_message *rm,
    unsigned int hdr_off, unsigned int sg, unsigned int off);
void rdsv3_ib_send_cqe_handler(struct rdsv3_ib_connection *ic, ibt_wc_t *wc);
void rdsv3_ib_send_init_ring(struct rdsv3_ib_connection *ic);
void rdsv3_ib_send_clear_ring(struct rdsv3_ib_connection *ic);
int rdsv3_ib_xmit_rdma(struct rdsv3_connection *conn, struct rdsv3_rdma_op *op);
void rdsv3_ib_send_add_credits(struct rdsv3_connection *conn,
    unsigned int credits);
void rdsv3_ib_advertise_credits(struct rdsv3_connection *conn,
    unsigned int posted);
int rdsv3_ib_send_grab_credits(struct rdsv3_ib_connection *ic, uint32_t wanted,
    uint32_t *adv_credits, int need_posted);

/* ib_stats.c */
RDSV3_DECLARE_PER_CPU(struct rdsv3_ib_statistics, rdsv3_ib_stats);
/* bump one member of the per-CPU rdsv3_ib_stats counters */
#define	rdsv3_ib_stats_inc(member) rdsv3_stats_inc_which(rdsv3_ib_stats, member)
unsigned int rdsv3_ib_stats_info_copy(struct rdsv3_info_iterator *iter,
    unsigned int avail);

/* ib_sysctl.c */
int rdsv3_ib_sysctl_init(void);
void rdsv3_ib_sysctl_exit(void);
extern unsigned long rdsv3_ib_sysctl_max_send_wr;
extern unsigned long rdsv3_ib_sysctl_max_recv_wr;
extern unsigned long rdsv3_ib_sysctl_max_unsig_wrs;
extern unsigned long rdsv3_ib_sysctl_max_unsig_bytes;
extern unsigned long rdsv3_ib_sysctl_max_recv_allocation;
extern unsigned int rdsv3_ib_sysctl_flow_control;

#endif /* _RDSV3_IB_H */