brcm2708: update 3.10 patches with raspberrypi/rpi-3.10.y of 27 Apr. 2014
[openwrt.git] / target / linux / brcm2708 / patches-3.10 / 0070-bcm2708_fb-DMA-acceleration-for-fb_copyarea.patch
1 From 370c8243ec8e7f3abd8171b7d2dde170f4c5e63a Mon Sep 17 00:00:00 2001
2 From: Siarhei Siamashka <siarhei.siamashka@gmail.com>
3 Date: Mon, 17 Jun 2013 16:00:25 +0300
4 Subject: [PATCH 070/196] bcm2708_fb: DMA acceleration for fb_copyarea
5
6 Based on http://www.raspberrypi.org/phpBB3/viewtopic.php?p=62425#p62425
7 Also used Simon's dmaer_master module as a reference for tweaking DMA
8 settings for better performance.
9
10 For now the driver only busy-loops until the DMA completes. IRQ support might be added later.
11 On a non-overclocked Raspberry Pi, the performance is ~360 MB/s
12 for a simple copy or ~260 MB/s for a two-pass copy (used when dragging
13 windows to the right).
14
15 When DMA channel 0 is used, the performance improves
16 to ~440 MB/s.
17
18 For comparison, VFP optimized CPU copy can only do ~114 MB/s in
19 the same conditions (hindered by reading uncached source buffer).
20
21 Signed-off-by: Siarhei Siamashka <siarhei.siamashka@gmail.com>
22 ---
23  drivers/video/bcm2708_fb.c | 162 ++++++++++++++++++++++++++++++++++++++++++++-
24  1 file changed, 159 insertions(+), 3 deletions(-)
25
26 diff --git a/drivers/video/bcm2708_fb.c b/drivers/video/bcm2708_fb.c
27 index 08d9238..c10c5ee 100644
28 --- a/drivers/video/bcm2708_fb.c
29 +++ b/drivers/video/bcm2708_fb.c
30 @@ -28,6 +28,7 @@
31  #include <linux/printk.h>
32  #include <linux/console.h>
33  
34 +#include <mach/dma.h>
35  #include <mach/platform.h>
36  #include <mach/vcio.h>
37  
38 @@ -63,6 +64,11 @@ struct bcm2708_fb {
39         struct fbinfo_s *info;
40         dma_addr_t dma;
41         u32 cmap[16];
42 +       int dma_chan;
43 +       int dma_irq;
44 +       void __iomem *dma_chan_base;
45 +       void *cb_base;          /* DMA control blocks */
46 +       dma_addr_t cb_handle;
47  };
48  
49  #define to_bcm2708(info)       container_of(info, struct bcm2708_fb, fb)
50 @@ -312,11 +318,133 @@ static void bcm2708_fb_fillrect(struct fb_info *info,
51         cfb_fillrect(info, rect);
52  }
53  
54 +/* Fill one DMA control block for a 2D copy: h rows of w bytes each */
55 +static void set_dma_cb(struct bcm2708_dma_cb *cb,
56 +                      int        burst_size,
57 +                      dma_addr_t dst,
58 +                      int        dst_stride,
59 +                      dma_addr_t src,
60 +                      int        src_stride,
61 +                      int        w,
62 +                      int        h)
63 +{
64 +       cb->info = BCM2708_DMA_BURST(burst_size) | BCM2708_DMA_S_WIDTH |
65 +                  BCM2708_DMA_S_INC | BCM2708_DMA_D_WIDTH |
66 +                  BCM2708_DMA_D_INC | BCM2708_DMA_TDMODE;
67 +       cb->dst = dst;
68 +       cb->src = src;
69 +       /*
70 +        * 2D mode: the top 16 bits of length must be "height - 1", not
71 +        * "height" (not obvious from the DMA documentation). The stride
72 +        * word packs (stride - w) for dst (high 16) and src (low 16).
73 +        */
74 +       cb->length = ((h - 1) << 16) | w;
75 +       cb->stride = ((dst_stride - w) << 16) | (u16)(src_stride - w);
76 +       cb->pad[0] = 0;
77 +       cb->pad[1] = 0;
78 +}
79 +
80  static void bcm2708_fb_copyarea(struct fb_info *info,
81                                 const struct fb_copyarea *region)
82  {
83 -       /*print_debug("bcm2708_fb_copyarea\n"); */
84 -       cfb_copyarea(info, region);
85 +       struct bcm2708_fb *fb = to_bcm2708(info);
86 +       struct bcm2708_dma_cb *cb = fb->cb_base;
87 +       int bytes_per_pixel = (info->var.bits_per_pixel + 7) >> 3;
88 +       /* Channel 0 supports larger bursts and is a bit faster */
89 +       int burst_size = (fb->dma_chan == 0) ? 8 : 2;
90 +
91 +       /* Fall back to cfb_copyarea() on unsupported depth/size or bad region */
92 +       if (bytes_per_pixel > 4 ||
93 +           info->var.xres > 1920 || info->var.yres > 1200 ||
94 +           region->width <= 0 || region->width > info->var.xres ||
95 +           region->height <= 0 || region->height > info->var.yres ||
96 +           region->sx < 0 || region->sx >= info->var.xres ||
97 +           region->sy < 0 || region->sy >= info->var.yres ||
98 +           region->dx < 0 || region->dx >= info->var.xres ||
99 +           region->dy < 0 || region->dy >= info->var.yres ||
100 +           region->sx + region->width > info->var.xres ||
101 +           region->dx + region->width > info->var.xres ||
102 +           region->sy + region->height > info->var.yres ||
103 +           region->dy + region->height > info->var.yres) {
104 +               cfb_copyarea(info, region);
105 +               return;
106 +       }
107 +
108 +       if (region->dy == region->sy && region->dx > region->sx) {
109 +               /*
110 +                * A difficult case of overlapped copy. Because DMA can't
111 +                * copy individual scanlines in backwards direction, we need
112 +                * two-pass processing. We do it by programming a chain of dma
113 +                * control blocks in the first 16K part of the buffer and using
114 +                * the remaining 48K as the intermediate temporary scratch
115 +                * buffer. The buffer size is sufficient to handle up to
116 +                * 1920x1200 resolution at 32bpp pixel depth.
117 +                */
118 +               int y;
119 +               dma_addr_t control_block_pa = fb->cb_handle;
120 +               dma_addr_t scratchbuf = fb->cb_handle + 16 * 1024;
121 +               int scanline_size = bytes_per_pixel * region->width;
122 +               int scanlines_per_cb = (64 * 1024 - 16 * 1024) / scanline_size;
123 +
124 +               for (y = 0; y < region->height; y += scanlines_per_cb) {
125 +                       dma_addr_t src =
126 +                               fb->fb.fix.smem_start +
127 +                               bytes_per_pixel * region->sx +
128 +                               (region->sy + y) * fb->fb.fix.line_length;
129 +                       dma_addr_t dst =
130 +                               fb->fb.fix.smem_start +
131 +                               bytes_per_pixel * region->dx +
132 +                               (region->dy + y) * fb->fb.fix.line_length;
133 +
134 +                       if (region->height - y < scanlines_per_cb)
135 +                               scanlines_per_cb = region->height - y;
136 +
137 +                       set_dma_cb(cb, burst_size, scratchbuf, scanline_size,
138 +                                  src, fb->fb.fix.line_length,
139 +                                  scanline_size, scanlines_per_cb);
140 +                       control_block_pa += sizeof(struct bcm2708_dma_cb);
141 +                       cb->next = control_block_pa;
142 +                       cb++;
143 +
144 +                       set_dma_cb(cb, burst_size, dst, fb->fb.fix.line_length,
145 +                                  scratchbuf, scanline_size,
146 +                                  scanline_size, scanlines_per_cb);
147 +                       control_block_pa += sizeof(struct bcm2708_dma_cb);
148 +                       cb->next = control_block_pa;
149 +                       cb++;
150 +               }
151 +               /* move the pointer back to the last dma control block */
152 +               cb--;
153 +       } else {
154 +               /* A single dma control block is enough. */
155 +               int sy, dy, stride;
156 +               if (region->dy <= region->sy) {
157 +                       /* processing from top to bottom */
158 +                       dy = region->dy;
159 +                       sy = region->sy;
160 +                       stride = fb->fb.fix.line_length;
161 +               } else {
162 +                       /* processing from bottom to top */
163 +                       dy = region->dy + region->height - 1;
164 +                       sy = region->sy + region->height - 1;
165 +                       stride = -fb->fb.fix.line_length;
166 +               }
167 +               set_dma_cb(cb, burst_size,
168 +                          fb->fb.fix.smem_start + dy * fb->fb.fix.line_length +
169 +                                                  bytes_per_pixel * region->dx,
170 +                          stride,
171 +                          fb->fb.fix.smem_start + sy * fb->fb.fix.line_length +
172 +                                                  bytes_per_pixel * region->sx,
173 +                          stride,
174 +                          region->width * bytes_per_pixel,
175 +                          region->height);
176 +       }
177 +
178 +       /* terminate the chain of dma control blocks */
179 +       cb->next = 0;
180 +
181 +       bcm_dma_start(fb->dma_chan_base, fb->cb_handle);
182 +       bcm_dma_wait_idle(fb->dma_chan_base);
183  }
184  
185  static void bcm2708_fb_imageblit(struct fb_info *info,
186 @@ -359,7 +487,7 @@ static int bcm2708_fb_register(struct bcm2708_fb *fb)
187                 fb->dma = dma;
188         }
189         fb->fb.fbops = &bcm2708_fb_ops;
190 -       fb->fb.flags = FBINFO_FLAG_DEFAULT;
191 +       fb->fb.flags = FBINFO_FLAG_DEFAULT | FBINFO_HWACCEL_COPYAREA;
192         fb->fb.pseudo_palette = fb->cmap;
193  
194         strncpy(fb->fb.fix.id, bcm2708_name, sizeof(fb->fb.fix.id));
195 @@ -424,6 +552,28 @@ static int bcm2708_fb_probe(struct platform_device *dev)
196         }
197         memset(fb, 0, sizeof(struct bcm2708_fb));
198  
199 +       fb->cb_base = dma_alloc_writecombine(&dev->dev, SZ_64K,
200 +                                            &fb->cb_handle, GFP_KERNEL);
201 +       if (!fb->cb_base) {
202 +               dev_err(&dev->dev, "cannot allocate DMA CBs\n");
203 +               ret = -ENOMEM;
204 +               goto free_fb;
205 +       }
206 +
207 +       pr_info("BCM2708FB: allocated DMA memory %08x\n",
208 +              fb->cb_handle);
209 +
210 +       ret = bcm_dma_chan_alloc(BCM_DMA_FEATURE_BULK,
211 +                                &fb->dma_chan_base, &fb->dma_irq);
212 +       if (ret < 0) {
213 +               dev_err(&dev->dev, "couldn't allocate a DMA channel\n");
214 +               goto free_cb;
215 +       }
216 +       fb->dma_chan = ret;
217 +
218 +       pr_info("BCM2708FB: allocated DMA channel %d @ %p\n",
219 +              fb->dma_chan, fb->dma_chan_base);
220 +
221         fb->dev = dev;
222  
223         ret = bcm2708_fb_register(fb);
224 @@ -432,6 +582,9 @@ static int bcm2708_fb_probe(struct platform_device *dev)
225                 goto out;
226         }
227  
228 +free_cb:
229 +       dma_free_writecombine(&dev->dev, SZ_64K, fb->cb_base, fb->cb_handle);
230 +free_fb:
231         kfree(fb);
232  free_region:
233         dev_err(&dev->dev, "probe failed, err %d\n", ret);
234 @@ -449,6 +602,9 @@ static int bcm2708_fb_remove(struct platform_device *dev)
235                 iounmap(fb->fb.screen_base);
236         unregister_framebuffer(&fb->fb);
237  
238 +       dma_free_writecombine(&dev->dev, SZ_64K, fb->cb_base, fb->cb_handle);
239 +       bcm_dma_chan_free(fb->dma_chan);
240 +
241         dma_free_coherent(NULL, PAGE_ALIGN(sizeof(*fb->info)), (void *)fb->info,
242                           fb->dma);
243         kfree(fb);
244 -- 
245 1.9.1
246