24,25c24
<  * Changes:	Ira Burton	:	Support for SOCK_CLUSTER
<  *		Pedro Roque	:	Retransmit queue handled by TCP.
---
>  * Changes:	Pedro Roque	:	Retransmit queue handled by TCP.
298,358d296
< /* Ira Burton
<  * Builds the tcp header, queue the skb for transmission, and
<  * updates the TCP window. No support for SACKs because they
<  * should not happen on a cluster.
<  *
<  * We are working here with either a clone of the original
<  * SKB, or a fresh unique copy made by the retransmit engine.
<  */
< __inline__ int cluster_transmit_skb(struct sock *sk, struct sk_buff *skb)
< {
< 	//skb must be checked because it was just cloned
< 	if(skb != NULL) {
< 		struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
< 		struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
< 		struct tcphdr *th;
< 		int err;
<
< 		th = (struct tcphdr *) skb_push(skb, sizeof(struct tcphdr));
< 		skb->h.th = th;
< 		skb_set_owner_w(skb, sk);
<
< 		/* Build TCP header and checksum it. */
< 		th->source = sk->sport;
< 		th->dest = sk->dport;
< 		th->seq = htonl(tcb->seq);
< 		th->ack_seq = htonl(tp->rcv_nxt);
< 		*(((__u16 *)th) + 6) = htons(((sizeof(struct tcphdr) >> 2) << 12) | tcb->flags);
< 		th->window = htons(tcp_select_window(sk));
< 		th->check = 0;
< 		th->urg_ptr = 0;
< 		skb->csum = 0;
< 		skb->ip_summed = CHECKSUM_UNNECESSARY;
<
< 		TCP_ECN_send(sk, tp, skb, sizeof(struct tcphdr));
<
< 		if (tcb->flags & TCPCB_FLAG_ACK)
< 			tcp_event_ack_sent(sk);
<
< 		if (skb->len != sizeof(struct tcphdr))
< 			tcp_event_data_sent(tp, skb);
<
<
< 		TCP_INC_STATS(TcpOutSegs);
<
< 		err = cluster_ip_queue_xmit(skb);
< 		if (err <= 0)
< 			return err;
<
< 		tcp_enter_cwr(tp);
<
< 		/* NET_XMIT_CN is special. It does not guarantee,
< 		 * that this packet is lost. It tells that device
< 		 * is about to start to drop packets or already
< 		 * drops some packets of the same priority and
< 		 * invokes us to send less aggressively.
< 		 */
< 		return err == NET_XMIT_CN ? 0 : err;
< 	}
< 	return -ENOBUFS;
<
< }
412,435d349
< /* Ira Burton
<  * Send _single_ skb sitting at the send head. This function requires
<  * true push pending frames to setup probe timer etc. Identical to
<  * TCP code except calls cluster functions.
<  */
< inline void cluster_push_one(struct sock *sk, unsigned cur_mss)
< {
< 	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
< 	struct sk_buff *skb = tp->send_head;
<
< 	if (tcp_snd_test(tp, skb, cur_mss, 1)) {
< 		/* Send it out now. */
< 		TCP_SKB_CB(skb)->when = tcp_time_stamp;
< 		if (cluster_transmit_skb(sk, skb_clone(skb, sk->allocation)) == 0) {
< 			tp->send_head = NULL;
< 			tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
< 			if (tp->packets_out++ == 0)
< 				tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
< 			return;
< 		}
< 	}
< }
<
<
565,623d478
< /* Ira Burton
<  * Function to create two new TCP segments. Shrinks the given segment
<  * to the specified size and appends a new segment with the rest of the
<  * packet to the list. This won't be called frequently, I hope.
<  * Remember, these are still headerless SKBs at this point. This is indentical
<  * to the tcp code, except it calls cluster functions.
<  */
< static __inline__ int cluster_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
< {
< 	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
< 	struct sk_buff *buff;
< 	int nsize = skb->len - len;
< 	u16 flags;
<
< 	if (skb_cloned(skb) &&
< 	    skb_is_nonlinear(skb) &&
< 	    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
< 		return -ENOMEM;
<
< 	/* Get a new skb... force flag on. */
< 	buff = tcp_alloc_skb(sk, nsize, GFP_ATOMIC);
< 	if (buff == NULL)
< 		return -ENOMEM; /* We'll just try again later. */
< 	tcp_charge_skb(sk, buff);
<
< 	/* Correct the sequence numbers. */
< 	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
< 	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
< 	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
<
< 	/* PSH and FIN should only be set in the second packet. */
< 	flags = TCP_SKB_CB(skb)->flags;
< 	TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
< 	TCP_SKB_CB(buff)->flags = flags;
< 	TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked&(TCPCB_LOST|TCPCB_EVER_RETRANS|TCPCB_AT_TAIL);
< 	if (TCP_SKB_CB(buff)->sacked&TCPCB_LOST) {
< 		tp->lost_out++;
< 		tp->left_out++;
< 	}
< 	TCP_SKB_CB(skb)->sacked &= ~TCPCB_AT_TAIL;
<
<
<
< 	buff->csum = 0;
< 	skb->csum = 0;
< 	buff->ip_summed = skb->ip_summed = CHECKSUM_UNNECESSARY;
<
< 	/* Looks stupid, but our code really uses when of
< 	 * skbs, which it never sent before. --ANK
< 	 */
< 	TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
<
< 	/* Link BUFF into the send queue. */
< 	__skb_append(skb, buff);
<
< 	return 0;
< }
<
<
736,781d590
< /* Ira Burton
<  * Sends the packets to be transmitted as long as our window has not
<  * changed. Identical to TCP code, excpet calls cluster functions.
<  */
< inline int cluster_write_xmit(struct sock *sk, int nonagle)
< {
< 	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
< 	unsigned int mss_now;
<
< 	/* If we are closed, the bytes will have to remain here.
< 	 * * In time closedown will finish, we empty the write queue and all
< 	 * * will be happy.
< 	 * */
< 	if(sk->state != TCP_CLOSE) {
< 		struct sk_buff *skb;
< 		int sent_pkts = 0;
<
< 		mss_now = tcp_current_mss(sk);
<
< 		while((skb = tp->send_head) &&
< 		      tcp_snd_test(tp, skb, mss_now, tcp_skb_is_last(sk, skb) ? nonagle : 1)) {
< 			if (skb->len > mss_now) {
< 				if (cluster_fragment(sk, skb, mss_now))
< 					break;
< 			}
<
< 			TCP_SKB_CB(skb)->when = tcp_time_stamp;
< 			if (cluster_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))
< 				break;
< 			/* Advance the send_head. This one is sent out. */
< 			update_send_head(sk, tp, skb);
< 			tcp_minshall_update(tp, mss_now, skb);
< 			sent_pkts = 1;
< 		}
<
< 		if (sent_pkts) {
< 			tcp_cwnd_validate(sk, tp);
< 			return 0;
< 		}
<
< 		return !tp->packets_out && tp->send_head;
< 	}
< 	return 0;
< }
<
<
1489,1545d1297
< /* Ira Burton
<  * Send out a delayed ack, the caller does the policy checking
<  * to see if we should even be here. See tcp_input.c:tcp_ack_snd_check()
<  * for details. This is identical to the TCP code except it calls cluster
<  * functions.
<  */
< inline void cluster_send_delayed_ack(struct sock *sk)
< {
< 	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
< 	int ato = tp->ack.ato;
< 	unsigned long timeout;
<
< 	if (ato > TCP_DELACK_MIN) {
< 		int max_ato = HZ/2;
<
< 		if (tp->ack.pingpong || (tp->ack.pending&TCP_ACK_PUSHED))
< 			max_ato = TCP_DELACK_MAX;
<
< 		/* Slow path, intersegment interval is "high". */
<
< 		/* If some rtt estimate is known, use it to bound delayed ack.
< 		 * Do not use tp->rto here, use results of rtt measurements
< 		 * directly.
< 		 */
< 		if (tp->srtt) {
< 			int rtt = max(tp->srtt>>3, TCP_DELACK_MIN);
<
< 			if (rtt < max_ato)
< 				max_ato = rtt;
< 		}
<
< 		ato = min(ato, max_ato);
< 	}
<
< 	/* Stay within the limit we were given */
< 	timeout = jiffies + ato;
<
< 	/* Use new timeout only if there wasn't a older one earlier. */
< 	if (tp->ack.pending&TCP_ACK_TIMER) {
< 		/* If delack timer was blocked or is about to expire,
< 		 * send ACK now.
< 		 */
< 		if (tp->ack.blocked || time_before_eq(tp->ack.timeout, jiffies+(ato>>2))) {
< 			cluster_send_ack(sk);
< 			return;
< 		}
<
< 		if (!time_before(timeout, tp->ack.timeout))
< 			timeout = tp->ack.timeout;
< 	}
< 	tp->ack.pending |= TCP_ACK_SCHED|TCP_ACK_TIMER;
< 	tp->ack.timeout = timeout;
< 	if (!mod_timer(&tp->delack_timer, timeout))
< 		sock_hold(sk);
< }
<
<
1576,1611d1327
< 	}
< }
<
< /* Ira Burton
<  * This routine sends an ack and also updates the window. Identical to TCP code
<  * excpet calls cluster functions, and doesn't use a checksum.
<  */
< inline void cluster_send_ack(struct sock *sk)
< {
< 	/* If we have been reset, we may not send again. */
< 	if(sk->state != TCP_CLOSE) {
< 		struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
< 		struct sk_buff *buff;
<
< 		/* We are not putting this on the write queue, so
< 		 * tcp_transmit_skb() will set the ownership to this
< 		 * sock.
< 		 */
< 		buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
< 		if (buff == NULL) {
< 			tcp_schedule_ack(tp);
< 			tp->ack.ato = TCP_ATO_MIN;
< 			tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX);
< 			return;
< 		}
<
< 		/* Reserve space for headers and prepare control bits. */
< 		skb_reserve(buff, MAX_TCP_HEADER);
< 		buff->csum = 0;
< 		TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK;
< 		TCP_SKB_CB(buff)->sacked = 0;
<
< 		/* Send it off, this clears delayed acks for us. */
< 		TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk, tp);
< 		TCP_SKB_CB(buff)->when = tcp_time_stamp;
< 		cluster_transmit_skb(sk, buff);