1 /* 2 * This file contains definitions imported from the OFED rds header ib.h. 3 * Oracle elects to have and use the contents of ib.h under and 4 * governed by the OpenIB.org BSD license. 5 */ 6 /* 7 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 8 */ 9 10 #ifndef _RDSV3_IB_H 11 #define _RDSV3_IB_H 12 13 #include <sys/rds.h> 14 #include <sys/ib/clients/rdsv3/rdsv3.h> 15 #include <sys/ib/clients/rdsv3/rdma_transport.h> 16 #include <sys/ib/clients/rdsv3/rdsv3_af_thr.h> 17 18 #define RDSV3_FMR_SIZE 256 19 #define RDSV3_FMR_POOL_SIZE (12 * 1024) 20 21 #define RDSV3_IB_SEND_WRS 64 22 23 #define RDSV3_IB_MAX_SGE 8 24 #define RDSV3_IB_RECV_SGE 2 25 26 #define RDSV3_IB_DEFAULT_RECV_WR 1024 27 #define RDSV3_IB_DEFAULT_SEND_WR 256 28 29 #define RDSV3_IB_DEFAULT_RETRY_COUNT 2 30 31 /* minor versions supported */ 32 #define RDSV3_IB_SUPPORTED_PROTOCOLS 0x00000003 33 34 extern struct list rdsv3_ib_devices; 35 36 /* 37 * IB posts RDSV3_FRAG_SIZE fragments of pages to the receive queues to 38 * try and minimize the amount of memory tied up both the device and 39 * socket receive queues. 40 */ 41 /* page offset of the final full frag that fits in the page */ 42 #define RDSV3_PAGE_LAST_OFF \ 43 (((PAGE_SIZE / RDSV3_FRAG_SIZE) - 1) * RDSV3_FRAG_SIZE) 44 struct rdsv3_page_frag { 45 struct list_node f_item; 46 caddr_t f_page; 47 unsigned long f_offset; 48 ibt_wr_ds_t f_sge; 49 ibt_mi_hdl_t f_mapped; 50 }; 51 52 struct rdsv3_ib_incoming { 53 list_node_t ii_obj; /* list obj of rdsv3_inc_pool list */ 54 struct list ii_frags; 55 struct rdsv3_incoming ii_inc; 56 struct rdsv3_inc_pool *ii_pool; 57 struct rdsv3_ib_device *ii_ibdev; 58 }; 59 60 struct rdsv3_ib_connect_private { 61 /* Add new fields at the end, and don't permute existing fields. */ 62 uint32_be_t dp_saddr; 63 uint32_be_t dp_daddr; 64 uint8_t dp_protocol_major; 65 uint8_t dp_protocol_minor; 66 uint16_be_t dp_protocol_minor_mask; /* bitmask */ 67 uint32_be_t dp_reserved1; 68 uint32_be_t dp_ack_seq; 69 uint32_be_t dp_credit; /* non-zero enables flow ctl */ 70 }; 71 72 struct rdsv3_ib_send_work { 73 struct rdsv3_message *s_rm; 74 struct rdsv3_rdma_op *s_op; 75 ibt_wrc_opcode_t s_opcode; 76 unsigned long s_queued; 77 }; 78 79 struct rdsv3_ib_recv_work { 80 struct rdsv3_ib_incoming *r_ibinc; 81 struct rdsv3_page_frag *r_frag; 82 ibt_wr_ds_t r_sge[2]; 83 }; 84 85 struct rdsv3_ib_work_ring { 86 uint32_t w_nr; 87 uint32_t w_alloc_ptr; 88 uint32_t w_alloc_ctr; 89 uint32_t w_free_ptr; 90 atomic_t w_free_ctr; 91 rdsv3_wait_queue_t w_empty_wait; 92 }; 93 94 /* 95 * Rings are posted with all the allocations they'll need to queue the 96 * incoming message to the receiving socket so this can't fail. 97 * All fragments start with a header, so we can make sure we're not receiving 98 * garbage, and we can tell a small 8 byte fragment from an ACK frame. 99 */ 100 struct rdsv3_ib_ack_state { 101 uint64_t ack_next; 102 uint64_t ack_recv; 103 unsigned int ack_required:1; 104 unsigned int ack_next_valid:1; 105 unsigned int ack_recv_valid:1; 106 }; 107 108 struct rdsv3_ib_device; 109 110 struct rdsv3_ib_connection { 111 112 struct list_node ib_node; 113 boolean_t i_on_dev_list; 114 struct rdsv3_ib_device *rds_ibdev; 115 struct rdsv3_connection *conn; 116 117 /* alphabet soup, IBTA style */ 118 struct rdma_cm_id *i_cm_id; 119 struct ib_pd *i_pd; 120 struct rdsv3_hdrs_mr *i_mr; 121 struct ib_cq *i_cq; 122 struct ib_cq *i_snd_cq; 123 124 /* tx */ 125 struct rdsv3_ib_work_ring i_send_ring; 126 struct rdsv3_message *i_rm; 127 struct rdsv3_header *i_send_hdrs; 128 uint64_t i_send_hdrs_dma; 129 struct rdsv3_ib_send_work *i_sends; 130 ibt_send_wr_t *i_send_wrs; 131 132 /* soft CQ */ 133 rdsv3_af_thr_t *i_soft_cq; 134 rdsv3_af_thr_t *i_snd_soft_cq; 135 rdsv3_af_thr_t *i_refill_rq; 136 137 /* rx */ 138 struct mutex i_recv_mutex; 139 struct rdsv3_ib_work_ring i_recv_ring; 140 struct rdsv3_ib_incoming *i_ibinc; 141 uint32_t i_recv_data_rem; 142 struct rdsv3_header *i_recv_hdrs; 143 uint64_t i_recv_hdrs_dma; 144 struct rdsv3_ib_recv_work *i_recvs; 145 ibt_recv_wr_t *i_recv_wrs; 146 struct rdsv3_page_frag i_frag; 147 uint64_t i_ack_recv; /* last ACK received */ 148 149 /* sending acks */ 150 unsigned long i_ack_flags; 151 #ifdef KERNEL_HAS_ATOMIC64 152 atomic64_t i_ack_next; /* next ACK to send */ 153 #else 154 kmutex_t i_ack_lock; /* protect i_ack_next */ 155 uint64_t i_ack_next; /* next ACK to send */ 156 #endif 157 struct rdsv3_header *i_ack; 158 ibt_send_wr_t i_ack_wr; 159 ibt_wr_ds_t i_ack_sge; 160 uint64_t i_ack_dma; 161 unsigned long i_ack_queued; 162 163 /* 164 * Flow control related information 165 * 166 * Our algorithm uses a pair variables that we need to access 167 * atomically - one for the send credits, and one posted 168 * recv credits we need to transfer to remote. 169 * Rather than protect them using a slow spinlock, we put both into 170 * a single atomic_t and update it using cmpxchg 171 */ 172 atomic_t i_credits; 173 174 /* Protocol version specific information */ 175 unsigned int i_flowctl:1; /* enable/disable flow ctl */ 176 177 /* Batched completions */ 178 unsigned int i_unsignaled_wrs; 179 long i_unsignaled_bytes; 180 }; 181 182 /* This assumes that atomic_t is at least 32 bits */ 183 #define IB_GET_SEND_CREDITS(v) ((v) & 0xffff) 184 #define IB_GET_POST_CREDITS(v) ((v) >> 16) 185 #define IB_SET_SEND_CREDITS(v) ((v) & 0xffff) 186 #define IB_SET_POST_CREDITS(v) ((v) << 16) 187 188 struct rdsv3_ib_ipaddr { 189 struct list_node list; 190 uint32_be_t ipaddr; 191 }; 192 193 struct rdsv3_ib_device { 194 struct list_node list; 195 struct list ipaddr_list; 196 struct list conn_list; 197 ib_device_t *dev; 198 struct ib_pd *pd; 199 struct kmem_cache *ib_frag_slab; 200 kmutex_t spinlock; /* protect the above */ 201 krwlock_t rwlock; /* protect paddr_list */ 202 unsigned int fmr_max_remaps; 203 unsigned int max_fmrs; 204 unsigned int fmr_message_size; 205 int max_sge; 206 unsigned int max_wrs; 207 unsigned int max_initiator_depth; 208 unsigned int max_responder_resources; 209 struct rdsv3_fmr_pool *fmr_pool; 210 struct rdsv3_inc_pool *inc_pool; 211 ibt_fmr_pool_hdl_t fmr_pool_hdl; 212 ibt_hca_attr_t hca_attr; 213 rdsv3_af_thr_t *fmr_soft_cq; 214 rdsv3_af_thr_t *inc_soft_cq; 215 ibt_hca_hdl_t ibt_hca_hdl; 216 rdsv3_af_grp_t *aft_hcagp; 217 }; 218 219 /* bits for i_ack_flags */ 220 #define IB_ACK_IN_FLIGHT 0 221 #define IB_ACK_REQUESTED 1 222 223 #define RDSV3_IB_SEND_OP (1ULL << 63) 224 225 /* Magic WR_ID for ACKs */ 226 #define RDSV3_IB_ACK_WR_ID (~(uint64_t)0) 227 228 struct rdsv3_ib_statistics { 229 uint64_t s_ib_connect_raced; 230 uint64_t s_ib_listen_closed_stale; 231 uint64_t s_ib_evt_handler_call; 232 uint64_t s_ib_tasklet_call; 233 uint64_t s_ib_tx_cq_event; 234 uint64_t s_ib_tx_ring_full; 235 uint64_t s_ib_tx_throttle; 236 uint64_t s_ib_tx_sg_mapping_failure; 237 uint64_t s_ib_tx_stalled; 238 uint64_t s_ib_tx_credit_updates; 239 uint64_t s_ib_rx_cq_event; 240 uint64_t s_ib_rx_ring_empty; 241 uint64_t s_ib_rx_refill_from_cq; 242 uint64_t s_ib_rx_refill_from_thread; 243 uint64_t s_ib_rx_alloc_limit; 244 uint64_t s_ib_rx_credit_updates; 245 uint64_t s_ib_ack_sent; 246 uint64_t s_ib_ack_send_failure; 247 uint64_t s_ib_ack_send_delayed; 248 uint64_t s_ib_ack_send_piggybacked; 249 uint64_t s_ib_ack_received; 250 uint64_t s_ib_rdma_mr_alloc; 251 uint64_t s_ib_rdma_mr_free; 252 uint64_t s_ib_rdma_mr_used; 253 uint64_t s_ib_rdma_mr_pool_flush; 254 uint64_t s_ib_rdma_mr_pool_wait; 255 uint64_t s_ib_rdma_mr_pool_depleted; 256 }; 257 258 extern struct rdsv3_workqueue_struct_s *rds_ib_wq; 259 260 /* ib.c */ 261 extern struct rdsv3_transport rdsv3_ib_transport; 262 extern void rdsv3_ib_add_one(ib_device_t *device); 263 extern void rdsv3_ib_remove_one(ib_device_t *device); 264 extern struct ib_client rdsv3_ib_client; 265 266 extern unsigned int fmr_pool_size; 267 extern unsigned int fmr_message_size; 268 extern unsigned int rdsv3_ib_retry_count; 269 270 extern kmutex_t ib_nodev_conns_lock; 271 extern struct list ib_nodev_conns; 272 273 /* ib_cm.c */ 274 int rdsv3_ib_conn_alloc(struct rdsv3_connection *conn, int gfp); 275 void rdsv3_ib_conn_free(void *arg); 276 int rdsv3_ib_conn_connect(struct rdsv3_connection *conn); 277 void rdsv3_ib_conn_shutdown(struct rdsv3_connection *conn); 278 void rdsv3_conn_drop(struct rdsv3_connection *conn); 279 int rdsv3_ib_cm_handle_connect(struct rdma_cm_id *cm_id, 280 struct rdma_cm_event *event); 281 int rdsv3_ib_cm_initiate_connect(struct rdma_cm_id *cm_id); 282 void rdsv3_ib_cm_connect_complete(struct rdsv3_connection *conn, 283 struct rdma_cm_event *event); 284 void rdsv3_ib_tasklet_fn(void *data); 285 void rdsv3_ib_snd_tasklet_fn(void *data); 286 void rdsv3_ib_refill_fn(void *data); 287 288 /* ib_rdma.c */ 289 int rdsv3_ib_update_ipaddr(struct rdsv3_ib_device *rds_ibdev, 290 uint32_be_t ipaddr); 291 void rdsv3_ib_add_conn(struct rdsv3_ib_device *rds_ibdev, 292 struct rdsv3_connection *conn); 293 void rdsv3_ib_remove_conn(struct rdsv3_ib_device *rds_ibdev, 294 struct rdsv3_connection *conn); 295 void __rdsv3_ib_destroy_conns(struct list *list, kmutex_t *list_lock); 296 static inline void rdsv3_ib_destroy_nodev_conns(void) 297 { 298 __rdsv3_ib_destroy_conns(&ib_nodev_conns, &ib_nodev_conns_lock); 299 } 300 static inline void rdsv3_ib_destroy_conns(struct rdsv3_ib_device *rds_ibdev) 301 { 302 __rdsv3_ib_destroy_conns(&rds_ibdev->conn_list, &rds_ibdev->spinlock); 303 } 304 305 int rdsv3_ib_create_mr_pool(struct rdsv3_ib_device *); 306 void rdsv3_ib_destroy_mr_pool(struct rdsv3_ib_device *); 307 void rdsv3_ib_get_mr_info(struct rdsv3_ib_device *rds_ibdev, 308 struct rdsv3_info_rdma_connection *iinfo); 309 void *rdsv3_ib_get_mr(struct rdsv3_iovec *args, unsigned long nents, 310 struct rdsv3_sock *rs, uint32_t *key_ret); 311 void rdsv3_ib_sync_mr(void *trans_private, int dir); 312 void rdsv3_ib_free_mr(void *trans_private, int invalidate); 313 void rdsv3_ib_flush_mrs(void); 314 void rdsv3_ib_drain_mrlist_fn(void *data); 315 316 /* ib_recv.c */ 317 int rdsv3_ib_recv_init(void); 318 void rdsv3_ib_recv_exit(void); 319 int rdsv3_ib_recv(struct rdsv3_connection *conn); 320 int rdsv3_ib_recv_refill(struct rdsv3_connection *conn, int prefill); 321 void rdsv3_ib_inc_free(struct rdsv3_incoming *inc); 322 int rdsv3_ib_inc_copy_to_user(struct rdsv3_incoming *inc, uio_t *uiop, 323 size_t size); 324 void rdsv3_ib_recv_cqe_handler(struct rdsv3_ib_connection *ic, ibt_wc_t *wc, 325 struct rdsv3_ib_ack_state *state); 326 void rdsv3_ib_recv_init_ring(struct rdsv3_ib_connection *ic); 327 void rdsv3_ib_recv_clear_ring(struct rdsv3_ib_connection *ic); 328 void rdsv3_ib_recv_init_ack(struct rdsv3_ib_connection *ic); 329 void rdsv3_ib_attempt_ack(struct rdsv3_ib_connection *ic); 330 void rdsv3_ib_ack_send_complete(struct rdsv3_ib_connection *ic); 331 uint64_t rdsv3_ib_piggyb_ack(struct rdsv3_ib_connection *ic); 332 void rdsv3_ib_set_ack(struct rdsv3_ib_connection *ic, uint64_t seq, 333 int ack_required); 334 int rdsv3_ib_create_inc_pool(struct rdsv3_ib_device *); 335 void rdsv3_ib_destroy_inc_pool(struct rdsv3_ib_device *); 336 void rdsv3_ib_drain_inclist(void *); 337 338 /* ib_ring.c */ 339 void rdsv3_ib_ring_init(struct rdsv3_ib_work_ring *ring, uint32_t nr); 340 void rdsv3_ib_ring_resize(struct rdsv3_ib_work_ring *ring, uint32_t nr); 341 uint32_t rdsv3_ib_ring_alloc(struct rdsv3_ib_work_ring *ring, uint32_t val, 342 uint32_t *pos); 343 void rdsv3_ib_ring_free(struct rdsv3_ib_work_ring *ring, uint32_t val); 344 void rdsv3_ib_ring_unalloc(struct rdsv3_ib_work_ring *ring, uint32_t val); 345 int rdsv3_ib_ring_empty(struct rdsv3_ib_work_ring *ring); 346 int rdsv3_ib_ring_low(struct rdsv3_ib_work_ring *ring); 347 uint32_t rdsv3_ib_ring_oldest(struct rdsv3_ib_work_ring *ring); 348 uint32_t rdsv3_ib_ring_completed(struct rdsv3_ib_work_ring *ring, 349 uint32_t wr_id, uint32_t oldest); 350 351 /* ib_send.c */ 352 void rdsv3_ib_xmit_complete(struct rdsv3_connection *conn); 353 int rdsv3_ib_xmit(struct rdsv3_connection *conn, struct rdsv3_message *rm, 354 unsigned int hdr_off, unsigned int sg, unsigned int off); 355 void rdsv3_ib_send_cqe_handler(struct rdsv3_ib_connection *ic, ibt_wc_t *wc); 356 void rdsv3_ib_send_init_ring(struct rdsv3_ib_connection *ic); 357 void rdsv3_ib_send_clear_ring(struct rdsv3_ib_connection *ic); 358 int rdsv3_ib_xmit_rdma(struct rdsv3_connection *conn, struct rdsv3_rdma_op *op); 359 void rdsv3_ib_send_add_credits(struct rdsv3_connection *conn, 360 unsigned int credits); 361 void rdsv3_ib_advertise_credits(struct rdsv3_connection *conn, 362 unsigned int posted); 363 int rdsv3_ib_send_grab_credits(struct rdsv3_ib_connection *ic, uint32_t wanted, 364 uint32_t *adv_credits, int need_posted); 365 366 /* ib_stats.c */ 367 RDSV3_DECLARE_PER_CPU(struct rdsv3_ib_statistics, rdsv3_ib_stats); 368 #define rdsv3_ib_stats_inc(member) rdsv3_stats_inc_which(rdsv3_ib_stats, member) 369 unsigned int rdsv3_ib_stats_info_copy(struct rdsv3_info_iterator *iter, 370 unsigned int avail); 371 372 /* ib_sysctl.c */ 373 int rdsv3_ib_sysctl_init(void); 374 void rdsv3_ib_sysctl_exit(void); 375 extern unsigned long rdsv3_ib_sysctl_max_send_wr; 376 extern unsigned long rdsv3_ib_sysctl_max_recv_wr; 377 extern unsigned long rdsv3_ib_sysctl_max_unsig_wrs; 378 extern unsigned long rdsv3_ib_sysctl_max_unsig_bytes; 379 extern unsigned long rdsv3_ib_sysctl_max_recv_allocation; 380 extern unsigned int rdsv3_ib_sysctl_flow_control; 381 382 #endif /* _RDSV3_IB_H */ 383