diff gcc/config/i386/x86-tune-costs.h @ 131:84e7813d76e9

gcc-8.2
author mir3636
date Thu, 25 Oct 2018 07:37:49 +0900
parents 04ced10e8804
children 1830386684a0
--- a/gcc/config/i386/x86-tune-costs.h	Fri Oct 27 22:46:09 2017 +0900
+++ b/gcc/config/i386/x86-tune-costs.h	Thu Oct 25 07:37:49 2018 +0900
@@ -1,5 +1,5 @@
 /* Costs of operations of individual x86 CPUs.
-   Copyright (C) 1988-2017 Free Software Foundation, Inc.
+   Copyright (C) 1988-2018 Free Software Foundation, Inc.
 
 This file is part of GCC.
 
@@ -111,6 +111,10 @@
   ix86_size_memset,
   COSTS_N_BYTES (1),			/* cond_taken_branch_cost.  */
   COSTS_N_BYTES (1),			/* cond_not_taken_branch_cost.  */
+  NULL,					/* Loop alignment.  */
+  NULL,					/* Jump alignment.  */
+  NULL,					/* Label alignment.  */
+  NULL,					/* Func alignment.  */
 };
 
 /* Processor costs (relative to an add) */
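The four strings appended to each cost table above (and to every table below) supply per-CPU defaults for -falign-loops, -falign-jumps, -falign-labels and -falign-functions; NULL leaves the compiler's generic default in place. The format follows the -falign-* options, roughly "N:M:N2": prefer an N-byte boundary when it can be reached with about M bytes of padding or fewer, otherwise settle for an N2-byte boundary. A minimal sketch of that decision, under that reading of the format (choose_padding is a hypothetical illustration, not GCC's parser):

#include <stdio.h>

/* Hypothetical sketch of an "N:M:N2" alignment string such as "16:8:8":
   pad to an N-byte boundary when that takes no more than M bytes,
   otherwise pad only to the secondary N2-byte boundary.  */
static int
choose_padding (int offset, int n, int m, int n2)
{
  int pad = (n - offset % n) % n;   /* bytes needed to reach the N boundary */
  if (pad <= m)                     /* cheap enough: take primary alignment */
    return pad;
  return (n2 - offset % n2) % n2;   /* fall back to the N2 boundary */
}

int
main (void)
{
  /* With "16:8:8": at offset 41, reaching 16 costs 7 bytes, so pad 7;
     at offset 33 it would cost 15 bytes, so pad 7 to the 8 boundary.  */
  printf ("%d\n", choose_padding (41, 16, 8, 8));   /* 7 (to 48) */
  printf ("%d\n", choose_padding (33, 16, 8, 8));   /* 7 (to 40) */
  return 0;
}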
@@ -197,6 +201,10 @@
   i386_memset,
   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
+  "4",					/* Loop alignment.  */
+  "4",					/* Jump alignment.  */
+  NULL,					/* Label alignment.  */
+  "4",					/* Func alignment.  */
 };
 
 static stringop_algs i486_memcpy[2] = {
@@ -284,6 +292,10 @@
   i486_memset,
   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
+  "16",					/* Loop alignment.  */
+  "16",					/* Jump alignment.  */
+  "0:0:8",				/* Label alignment.  */
+  "16",					/* Func alignment.  */
 };
 
 static stringop_algs pentium_memcpy[2] = {
@@ -369,6 +381,10 @@
   pentium_memset,
   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
+  "16:8:8",				/* Loop alignment.  */
+  "16:8:8",				/* Jump alignment.  */
+  "0:0:8",				/* Label alignment.  */
+  "16",					/* Func alignment.  */
 };
 
 static const
@@ -447,6 +463,10 @@
   pentium_memset,
   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
+  "16:8:8",				/* Loop alignment.  */
+  "16:8:8",				/* Jump alignment.  */
+  "0:0:8",				/* Label alignment.  */
+  "16",					/* Func alignment.  */
 };
 
 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
@@ -540,6 +560,10 @@
   pentiumpro_memset,
   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
+  "16",					/* Loop alignment.  */
+  "16:11:8",				/* Jump alignment.  */
+  "0:0:8",				/* Label alignment.  */
+  "16",					/* Func alignment.  */
 };
 
 static stringop_algs geode_memcpy[2] = {
@@ -625,6 +649,10 @@
   geode_memset,
   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
+  NULL,					/* Loop alignment.  */
+  NULL,					/* Jump alignment.  */
+  NULL,					/* Label alignment.  */
+  NULL,					/* Func alignment.  */
 };
 
 static stringop_algs k6_memcpy[2] = {
@@ -712,6 +740,10 @@
   k6_memset,
   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
+  "32:8:8",				/* Loop alignment.  */
+  "32:8:8",				/* Jump alignment.  */
+  "0:0:8",				/* Label alignment.  */
+  "32",					/* Func alignment.  */
 };
 
 /* For some reason, Athlon deals better with REP prefix (relative to loops)
@@ -763,12 +795,12 @@
   {4, 4},				/* cost of storing MMX registers
 					   in SImode and DImode */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
-  {4, 4, 6, 12, 24},			/* cost of loading SSE registers
+  {4, 4, 12, 12, 24},			/* cost of loading SSE registers
 					   in 32,64,128,256 and 512-bit */
-  {4, 4, 6, 12, 24},			/* cost of unaligned loads.  */
-  {4, 4, 5, 10, 20},			/* cost of storing SSE registers
+  {4, 4, 12, 12, 24},			/* cost of unaligned loads.  */
+  {4, 4, 10, 10, 20},			/* cost of storing SSE registers
 					   in 32,64,128,256 and 512-bit */
-  {4, 4, 5, 10, 20},			/* cost of unaligned stores.  */
+  {4, 4, 10, 10, 20},			/* cost of unaligned stores.  */
   5, 5,					/* SSE->integer and integer->SSE moves */
   4, 4,					/* Gather load static, per_elt.  */
   4, 4,					/* Gather store static, per_elt.  */
@@ -800,6 +832,10 @@
   athlon_memset,
   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
+  "16:8:8",				/* Loop alignment.  */
+  "16:8:8",				/* Jump alignment.  */
+  "0:0:8",				/* Label alignment.  */
+  "16",					/* Func alignment.  */
 };
 
 /* K8 has optimized REP instruction for medium sized blocks, but for very
@@ -855,12 +891,12 @@
   {4, 4},				/* cost of storing MMX registers
 					   in SImode and DImode */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
-  {4, 3, 6, 12, 24},			/* cost of loading SSE registers
+  {4, 3, 12, 12, 24},			/* cost of loading SSE registers
 					   in 32,64,128,256 and 512-bit */
-  {4, 3, 6, 12, 24},			/* cost of unaligned loads.  */
-  {4, 4, 5, 10, 20},			/* cost of storing SSE registers
+  {4, 3, 12, 12, 24},			/* cost of unaligned loads.  */
+  {4, 4, 10, 10, 20},			/* cost of storing SSE registers
 					   in 32,64,128,256 and 512-bit */
-  {4, 4, 5, 10, 20},			/* cost of unaligned stores.  */
+  {4, 4, 10, 10, 20},			/* cost of unaligned stores.  */
   5, 5,					/* SSE->integer and integer->SSE moves */
   4, 4,					/* Gather load static, per_elt.  */
   4, 4,					/* Gather store static, per_elt.  */
@@ -897,6 +933,10 @@
   k8_memset,
   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
   COSTS_N_INSNS (2),			/* cond_not_taken_branch_cost.  */
+  "16:8:8",				/* Loop alignment.  */
+  "16:8:8",				/* Jump alignment.  */
+  "0:0:8",				/* Label alignment.  */
+  "16",					/* Func alignment.  */
 };
 
 /* AMDFAM10 has optimized REP instruction for medium sized blocks, but for
@@ -1001,122 +1041,27 @@
   amdfam10_memset,
   COSTS_N_INSNS (2),			/* cond_taken_branch_cost.  */
   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
+  "32:25:8",				/* Loop alignment.  */
+  "32:8:8",				/* Jump alignment.  */
+  "0:0:8",				/* Label alignment.  */
+  "32",					/* Func alignment.  */
 };
 
-/*  BDVER1 has optimized REP instruction for medium sized blocks, but for
+/*  BDVER has optimized REP instruction for medium sized blocks, but for
     very small blocks it is better to use loop. For large blocks, libcall
     can do nontemporary accesses and beat inline considerably.  */
-static stringop_algs bdver1_memcpy[2] = {
+static stringop_algs bdver_memcpy[2] = {
   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
              {-1, rep_prefix_4_byte, false}}},
   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
              {-1, libcall, false}}}};
-static stringop_algs bdver1_memset[2] = {
+static stringop_algs bdver_memset[2] = {
   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
              {-1, libcall, false}}}};
 
-const struct processor_costs bdver1_cost = {
-  COSTS_N_INSNS (1),			/* cost of an add instruction */
-  COSTS_N_INSNS (1),			/* cost of a lea instruction */
-  COSTS_N_INSNS (1),			/* variable shift costs */
-  COSTS_N_INSNS (1),			/* constant shift costs */
-  {COSTS_N_INSNS (4),			/* cost of starting multiply for QI */
-   COSTS_N_INSNS (4),			/*				 HI */
-   COSTS_N_INSNS (4),			/*				 SI */
-   COSTS_N_INSNS (6),			/*				 DI */
-   COSTS_N_INSNS (6)},			/*			      other */
-  0,					/* cost of multiply per each bit set */
-  {COSTS_N_INSNS (19),			/* cost of a divide/mod for QI */
-   COSTS_N_INSNS (35),			/*			    HI */
-   COSTS_N_INSNS (51),			/*			    SI */
-   COSTS_N_INSNS (83),			/*			    DI */
-   COSTS_N_INSNS (83)},			/*			    other */
-  COSTS_N_INSNS (1),			/* cost of movsx */
-  COSTS_N_INSNS (1),			/* cost of movzx */
-  8,					/* "large" insn */
-  9,					/* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  8,				     /* cost for loading QImode using movzbl */
-  {8, 8, 8},				/* cost of loading integer registers
-					   in QImode, HImode and SImode.
-					   Relative to reg-reg move (2).  */
-  {8, 8, 8},				/* cost of storing integer registers */
-  4,					/* cost of reg,reg fld/fst */
-  {12, 12, 28},				/* cost of loading fp registers
-		   			   in SFmode, DFmode and XFmode */
-  {10, 10, 18},				/* cost of storing fp registers
- 		   			   in SFmode, DFmode and XFmode */
-  4,					/* cost of moving MMX register */
-  {12, 12},				/* cost of loading MMX registers
-					   in SImode and DImode */
-  {10, 10},				/* cost of storing MMX registers
-					   in SImode and DImode */
-  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
-  {12, 12, 10, 20, 30},			/* cost of loading SSE registers
-					   in 32,64,128,256 and 512-bit */
-  {12, 12, 10, 20, 30},			/* cost of unaligned loads.  */
-  {10, 10, 10, 20, 30},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
-  {10, 10, 10, 20, 30},			/* cost of unaligned stores.  */
-  16, 20,				/* SSE->integer and integer->SSE moves */
-  12, 12,				/* Gather load static, per_elt.  */
-  10, 10,				/* Gather store static, per_elt.  */
-  16,					/* size of l1 cache.  */
-  2048,					/* size of l2 cache.  */
-  64,					/* size of prefetch block */
-  /* New AMD processors never drop prefetches; if they cannot be performed
-     immediately, they are queued.  We set number of simultaneous prefetches
-     to a large constant to reflect this (it probably is not a good idea not
-     to limit number of prefetches at all, as their execution also takes some
-     time).  */
-  100,					/* number of parallel prefetches */
-  2,					/* Branch cost */
-  COSTS_N_INSNS (6),			/* cost of FADD and FSUB insns.  */
-  COSTS_N_INSNS (6),			/* cost of FMUL instruction.  */
-  COSTS_N_INSNS (42),			/* cost of FDIV instruction.  */
-  COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
-  COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
-  COSTS_N_INSNS (52),			/* cost of FSQRT instruction.  */
-
-  COSTS_N_INSNS (2),			/* cost of cheap SSE instruction.  */
-  COSTS_N_INSNS (6),			/* cost of ADDSS/SD SUBSS/SD insns.  */
-  COSTS_N_INSNS (6),			/* cost of MULSS instruction.  */
-  COSTS_N_INSNS (6),			/* cost of MULSD instruction.  */
-  COSTS_N_INSNS (6),			/* cost of FMA SS instruction.  */
-  COSTS_N_INSNS (6),			/* cost of FMA SD instruction.  */
-  /* 9-24  */
-  COSTS_N_INSNS (24),			/* cost of DIVSS instruction.  */
-  /* 9-27  */
-  COSTS_N_INSNS (27),			/* cost of DIVSD instruction.  */
-  COSTS_N_INSNS (15),			/* cost of SQRTSS instruction.  */
-  COSTS_N_INSNS (26),			/* cost of SQRTSD instruction.  */
-  1, 2, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
-  bdver1_memcpy,
-  bdver1_memset,
-  COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
-  COSTS_N_INSNS (2),			/* cond_not_taken_branch_cost.  */
-};
-
-/*  BDVER2 has optimized REP instruction for medium sized blocks, but for
-    very small blocks it is better to use loop. For large blocks, libcall
-    can do nontemporary accesses and beat inline considerably.  */
-
-static stringop_algs bdver2_memcpy[2] = {
-  {libcall, {{6, loop, false}, {14, unrolled_loop, false},
-             {-1, rep_prefix_4_byte, false}}},
-  {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
-             {-1, libcall, false}}}};
-static stringop_algs bdver2_memset[2] = {
-  {libcall, {{8, loop, false}, {24, unrolled_loop, false},
-             {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
-  {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
-             {-1, libcall, false}}}};
-
-const struct processor_costs bdver2_cost = {
+const struct processor_costs bdver_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1),			/* cost of a lea instruction */
   COSTS_N_INSNS (1),			/* variable shift costs */
@@ -1155,12 +1100,12 @@
   {10, 10},				/* cost of storing MMX registers
 					   in SImode and DImode */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
-  {12, 12, 10, 20, 30},			/* cost of loading SSE registers
+  {12, 12, 10, 40, 60},			/* cost of loading SSE registers
 					   in 32,64,128,256 and 512-bit */
-  {12, 12, 10, 20, 30},			/* cost of unaligned loads.  */
-  {10, 10, 10, 20, 30},			/* cost of storing SSE registers
+  {12, 12, 10, 40, 60},			/* cost of unaligned loads.  */
+  {10, 10, 10, 40, 60},			/* cost of storing SSE registers
 					   in 32,64,128,256 and 512-bit */
-  {10, 10, 10, 20, 30},			/* cost of unaligned stores.  */
+  {10, 10, 10, 40, 60},			/* cost of unaligned stores.  */
   16, 20,				/* SSE->integer and integer->SSE moves */
   12, 12,				/* Gather load static, per_elt.  */
   10, 10,				/* Gather store static, per_elt.  */
@@ -1194,205 +1139,14 @@
   COSTS_N_INSNS (15),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (26),			/* cost of SQRTSD instruction.  */
   1, 2, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
-  bdver2_memcpy,
-  bdver2_memset,
+  bdver_memcpy,
+  bdver_memset,
   COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
   COSTS_N_INSNS (2),			/* cond_not_taken_branch_cost.  */
-};
-
-
-  /*  BDVER3 has optimized REP instruction for medium sized blocks, but for
-      very small blocks it is better to use loop. For large blocks, libcall
-      can do nontemporary accesses and beat inline considerably.  */
-static stringop_algs bdver3_memcpy[2] = {
-  {libcall, {{6, loop, false}, {14, unrolled_loop, false},
-             {-1, rep_prefix_4_byte, false}}},
-  {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
-             {-1, libcall, false}}}};
-static stringop_algs bdver3_memset[2] = {
-  {libcall, {{8, loop, false}, {24, unrolled_loop, false},
-             {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
-  {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
-             {-1, libcall, false}}}};
-struct processor_costs bdver3_cost = {
-  COSTS_N_INSNS (1),			/* cost of an add instruction */
-  COSTS_N_INSNS (1),			/* cost of a lea instruction */
-  COSTS_N_INSNS (1),			/* variable shift costs */
-  COSTS_N_INSNS (1),			/* constant shift costs */
-  {COSTS_N_INSNS (4),			/* cost of starting multiply for QI */
-   COSTS_N_INSNS (4),			/*				 HI */
-   COSTS_N_INSNS (4),			/*				 SI */
-   COSTS_N_INSNS (6),			/*				 DI */
-   COSTS_N_INSNS (6)},			/*			      other */
-  0,					/* cost of multiply per each bit set */
-  {COSTS_N_INSNS (19),			/* cost of a divide/mod for QI */
-   COSTS_N_INSNS (35),			/*			    HI */
-   COSTS_N_INSNS (51),			/*			    SI */
-   COSTS_N_INSNS (83),			/*			    DI */
-   COSTS_N_INSNS (83)},			/*			    other */
-  COSTS_N_INSNS (1),			/* cost of movsx */
-  COSTS_N_INSNS (1),			/* cost of movzx */
-  8,					/* "large" insn */
-  9,					/* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  8,				     /* cost for loading QImode using movzbl */
-  {8, 8, 8},				/* cost of loading integer registers
-					   in QImode, HImode and SImode.
-					   Relative to reg-reg move (2).  */
-  {8, 8, 8},				/* cost of storing integer registers */
-  4,					/* cost of reg,reg fld/fst */
-  {12, 12, 28},				/* cost of loading fp registers
-		   			   in SFmode, DFmode and XFmode */
-  {10, 10, 18},				/* cost of storing fp registers
- 		   			   in SFmode, DFmode and XFmode */
-  4,					/* cost of moving MMX register */
-  {12, 12},				/* cost of loading MMX registers
-					   in SImode and DImode */
-  {10, 10},				/* cost of storing MMX registers
-					   in SImode and DImode */
-  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
-  {12, 12, 10, 20, 30},			/* cost of loading SSE registers
-					   in 32,64,128,256 and 512-bit */
-  {12, 12, 10, 20, 30},			/* cost of unaligned loads.  */
-  {10, 10, 10, 20, 30},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
-  {10, 10, 10, 20, 30},			/* cost of unaligned stores.  */
-  16, 20,				/* SSE->integer and integer->SSE moves */
-  12, 12,				/* Gather load static, per_elt.  */
-  10, 10,				/* Gather store static, per_elt.  */
-  16,					/* size of l1 cache.  */
-  2048,					/* size of l2 cache.  */
-  64,					/* size of prefetch block */
-  /* New AMD processors never drop prefetches; if they cannot be performed
-     immediately, they are queued.  We set number of simultaneous prefetches
-     to a large constant to reflect this (it probably is not a good idea not
-     to limit number of prefetches at all, as their execution also takes some
-     time).  */
-  100,					/* number of parallel prefetches */
-  2,					/* Branch cost */
-  COSTS_N_INSNS (6),			/* cost of FADD and FSUB insns.  */
-  COSTS_N_INSNS (6),			/* cost of FMUL instruction.  */
-  COSTS_N_INSNS (42),			/* cost of FDIV instruction.  */
-  COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
-  COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
-  COSTS_N_INSNS (52),			/* cost of FSQRT instruction.  */
-
-  COSTS_N_INSNS (2),			/* cost of cheap SSE instruction.  */
-  COSTS_N_INSNS (6),			/* cost of ADDSS/SD SUBSS/SD insns.  */
-  COSTS_N_INSNS (6),			/* cost of MULSS instruction.  */
-  COSTS_N_INSNS (6),			/* cost of MULSD instruction.  */
-  COSTS_N_INSNS (6),			/* cost of FMA SS instruction.  */
-  COSTS_N_INSNS (6),			/* cost of FMA SD instruction.  */
-  /* 9-24  */
-  COSTS_N_INSNS (24),			/* cost of DIVSS instruction.  */
-  /* 9-27  */
-  COSTS_N_INSNS (27),			/* cost of DIVSD instruction.  */
-  COSTS_N_INSNS (15),			/* cost of SQRTSS instruction.  */
-  COSTS_N_INSNS (26),			/* cost of SQRTSD instruction.  */
-  1, 2, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
-  bdver3_memcpy,
-  bdver3_memset,
-  COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
-  COSTS_N_INSNS (2),			/* cond_not_taken_branch_cost.  */
-};
-
-/*  BDVER4 has optimized REP instruction for medium sized blocks, but for
-    very small blocks it is better to use loop. For large blocks, libcall
-    can do nontemporary accesses and beat inline considerably.  */
-static stringop_algs bdver4_memcpy[2] = {
-  {libcall, {{6, loop, false}, {14, unrolled_loop, false},
-             {-1, rep_prefix_4_byte, false}}},
-  {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
-             {-1, libcall, false}}}};
-static stringop_algs bdver4_memset[2] = {
-  {libcall, {{8, loop, false}, {24, unrolled_loop, false},
-             {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
-  {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
-             {-1, libcall, false}}}};
-struct processor_costs bdver4_cost = {
-  COSTS_N_INSNS (1),			/* cost of an add instruction */
-  COSTS_N_INSNS (1),			/* cost of a lea instruction */
-  COSTS_N_INSNS (1),			/* variable shift costs */
-  COSTS_N_INSNS (1),			/* constant shift costs */
-  {COSTS_N_INSNS (4),			/* cost of starting multiply for QI */
-   COSTS_N_INSNS (4),			/*				 HI */
-   COSTS_N_INSNS (4),			/*				 SI */
-   COSTS_N_INSNS (6),			/*				 DI */
-   COSTS_N_INSNS (6)},			/*			      other */
-  0,					/* cost of multiply per each bit set */
-  {COSTS_N_INSNS (19),			/* cost of a divide/mod for QI */
-   COSTS_N_INSNS (35),			/*			    HI */
-   COSTS_N_INSNS (51),			/*			    SI */
-   COSTS_N_INSNS (83),			/*			    DI */
-   COSTS_N_INSNS (83)},			/*			    other */
-  COSTS_N_INSNS (1),			/* cost of movsx */
-  COSTS_N_INSNS (1),			/* cost of movzx */
-  8,					/* "large" insn */
-  9,					/* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  8,				     /* cost for loading QImode using movzbl */
-  {8, 8, 8},				/* cost of loading integer registers
-					   in QImode, HImode and SImode.
-					   Relative to reg-reg move (2).  */
-  {8, 8, 8},				/* cost of storing integer registers */
-  4,					/* cost of reg,reg fld/fst */
-  {12, 12, 28},				/* cost of loading fp registers
-		   			   in SFmode, DFmode and XFmode */
-  {10, 10, 18},				/* cost of storing fp registers
- 		   			   in SFmode, DFmode and XFmode */
-  4,					/* cost of moving MMX register */
-  {12, 12},				/* cost of loading MMX registers
-					   in SImode and DImode */
-  {10, 10},				/* cost of storing MMX registers
-					   in SImode and DImode */
-  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
-  {12, 12, 10, 20, 30},			/* cost of loading SSE registers
-					   in 32,64,128,256 and 512-bit */
-  {12, 12, 10, 20, 30},			/* cost of unaligned loads.  */
-  {10, 10, 10, 20, 30},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
-  {10, 10, 10, 20, 30},			/* cost of unaligned stores.  */
-  16, 20,				/* SSE->integer and integer->SSE moves */
-  12, 12,				/* Gather load static, per_elt.  */
-  10, 10,				/* Gather store static, per_elt.  */
-  16,					/* size of l1 cache.  */
-  2048,					/* size of l2 cache.  */
-  64,					/* size of prefetch block */
-  /* New AMD processors never drop prefetches; if they cannot be performed
-     immediately, they are queued.  We set number of simultaneous prefetches
-     to a large constant to reflect this (it probably is not a good idea not
-     to limit number of prefetches at all, as their execution also takes some
-     time).  */
-  100,					/* number of parallel prefetches */
-  2,					/* Branch cost */
-  COSTS_N_INSNS (6),			/* cost of FADD and FSUB insns.  */
-  COSTS_N_INSNS (6),			/* cost of FMUL instruction.  */
-  COSTS_N_INSNS (42),			/* cost of FDIV instruction.  */
-  COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
-  COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
-  COSTS_N_INSNS (52),			/* cost of FSQRT instruction.  */
-
-  COSTS_N_INSNS (2),			/* cost of cheap SSE instruction.  */
-  COSTS_N_INSNS (6),			/* cost of ADDSS/SD SUBSS/SD insns.  */
-  COSTS_N_INSNS (6),			/* cost of MULSS instruction.  */
-  COSTS_N_INSNS (6),			/* cost of MULSD instruction.  */
-  COSTS_N_INSNS (6),			/* cost of FMA SS instruction.  */
-  COSTS_N_INSNS (6),			/* cost of FMA SD instruction.  */
-  /* 9-24  */
-  COSTS_N_INSNS (24),			/* cost of DIVSS instruction.  */
-  /* 9-27  */
-  COSTS_N_INSNS (27),			/* cost of DIVSD instruction.  */
-  COSTS_N_INSNS (15),			/* cost of SQRTSS instruction.  */
-  COSTS_N_INSNS (26),			/* cost of SQRTSD instruction.  */
-  1, 2, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
-  bdver4_memcpy,
-  bdver4_memset,
-  COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
-  COSTS_N_INSNS (2),			/* cond_not_taken_branch_cost.  */
+  "16:11:8",				/* Loop alignment.  */
+  "16:8:8",				/* Jump alignment.  */
+  "0:0:8",				/* Label alignment.  */
+  "11",					/* Func alignment.  */
 };
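The hunk above collapses the four essentially identical bdver1..bdver4 tables into a single bdver_cost, sharing one pair of memcpy/memset strategy tables; it also doubles the 256- and 512-bit SSE load/store costs, consistent with these cores splitting wide vector operations into 128-bit halves. In each stringop_algs initializer the leading libcall is the strategy for sizes unknown at compile time, followed by {max, alg, noalign} buckets terminated by max == -1. A minimal sketch of walking such a table (pick_alg is a stand-in; the real selection, decide_alg in i386.c, also weighs alignment and -m flags):

/* Sketch: choose a string-operation algorithm for an n-byte block from a
   {max, alg, noalign} table whose last entry uses max == -1 to mean
   "any larger size".  Types are trimmed down from the real ones.  */
enum stringop_alg { libcall, rep_prefix_4_byte, rep_prefix_8_byte,
                    loop_1_byte, loop, unrolled_loop };

struct stringop_strategy { int max; enum stringop_alg alg; int noalign; };

static enum stringop_alg
pick_alg (const struct stringop_strategy *s, long n)
{
  for (;; s++)
    if (s->max == -1 || n <= s->max)
      return s->alg;
}

For bdver_memcpy's 32-bit table this returns loop for n <= 6, unrolled_loop up to 14 bytes, and rep_prefix_4_byte for everything larger.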
 
 
@@ -1458,12 +1212,12 @@
   {8, 8},				/* cost of storing MMX registers
 					   in SImode and DImode.  */
   2, 3, 6,				/* cost of moving XMM,YMM,ZMM register.  */
-  {6, 6, 6, 10, 20},			/* cost of loading SSE registers
+  {6, 6, 6, 12, 24},			/* cost of loading SSE registers
 					   in 32,64,128,256 and 512-bit.  */
-  {6, 6, 6, 10, 20},			/* cost of unaligned loads.  */
-  {8, 8, 8, 8, 16},			/* cost of storing SSE registers
+  {6, 6, 6, 12, 24},			/* cost of unaligned loads.  */
+  {8, 8, 8, 16, 32},			/* cost of storing SSE registers
 					   in 32,64,128,256 and 512-bit.  */
-  {8, 8, 8, 8, 16},			/* cost of unaligned stores.  */
+  {8, 8, 8, 16, 32},			/* cost of unaligned stores.  */
   6, 6,					/* SSE->integer and integer->SSE moves.  */
  /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPS is 35 uops,
      throughput 12.  Approx 9 uops do not depend on vector size and every load
@@ -1513,8 +1267,107 @@
   znver1_memset,
   COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
   COSTS_N_INSNS (2),			/* cond_not_taken_branch_cost.  */
+  "16",					/* Loop alignment.  */
+  "16",					/* Jump alignment.  */
+  "0:0:8",				/* Label alignment.  */
+  "16",					/* Func alignment.  */
 };
 
+/* skylake_cost should produce code tuned for Skylake family of CPUs.  */
+static stringop_algs skylake_memcpy[2] = {
+  {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
+  {libcall, {{16, loop, false}, {512, unrolled_loop, false},
+             {-1, libcall, false}}}};
+
+static stringop_algs skylake_memset[2] = {
+  {libcall, {{6, loop_1_byte, true},
+             {24, loop, true},
+             {8192, rep_prefix_4_byte, true},
+             {-1, libcall, false}}},
+  {libcall, {{24, loop, true}, {512, unrolled_loop, false},
+             {-1, libcall, false}}}};
+
+static const
+struct processor_costs skylake_cost = {
+  COSTS_N_INSNS (1),			/* cost of an add instruction */
+  COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
+  COSTS_N_INSNS (1),			/* variable shift costs */
+  COSTS_N_INSNS (1),			/* constant shift costs */
+  {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
+   COSTS_N_INSNS (4),			/*				 HI */
+   COSTS_N_INSNS (3),			/*				 SI */
+   COSTS_N_INSNS (3),			/*				 DI */
+   COSTS_N_INSNS (3)},			/*			      other */
+  0,					/* cost of multiply per each bit set */
+  /* Expanding div/mod currently doesn't consider parallelism. So the cost
+     model is not realistic. We compensate by increasing the latencies a bit.  */
+  {COSTS_N_INSNS (11),			/* cost of a divide/mod for QI */
+   COSTS_N_INSNS (11),			/*			    HI */
+   COSTS_N_INSNS (14),			/*			    SI */
+   COSTS_N_INSNS (76),			/*			    DI */
+   COSTS_N_INSNS (76)},			/*			    other */
+  COSTS_N_INSNS (1),			/* cost of movsx */
+  COSTS_N_INSNS (0),			/* cost of movzx */
+  8,					/* "large" insn */
+  17,					/* MOVE_RATIO */
+
+  6,				     /* cost for loading QImode using movzbl */
+  {4, 4, 4},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {6, 6, 3},				/* cost of storing integer registers */
+  2,					/* cost of reg,reg fld/fst */
+  {6, 6, 8},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+  {6, 6, 10},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+  2,					/* cost of moving MMX register */
+  {6, 6},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {6, 6},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  2, 2, 4,				/* cost of moving XMM,YMM,ZMM register */
+  {6, 6, 6, 10, 20},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {6, 6, 6, 10, 20},			/* cost of unaligned loads.  */
+  {8, 8, 8, 12, 24},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {8, 8, 8, 8, 16},			/* cost of unaligned stores.  */
+  2, 2,					/* SSE->integer and integer->SSE moves */
+  20, 8,				/* Gather load static, per_elt.  */
+  22, 10,				/* Gather store static, per_elt.  */
+  64,					/* size of l1 cache.  */
+  512,					/* size of l2 cache.  */
+  64,					/* size of prefetch block */
+  6,					/* number of parallel prefetches */
+  3,					/* Branch cost */
+  COSTS_N_INSNS (3),			/* cost of FADD and FSUB insns.  */
+  COSTS_N_INSNS (4),			/* cost of FMUL instruction.  */
+  COSTS_N_INSNS (20),			/* cost of FDIV instruction.  */
+  COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
+  COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
+  COSTS_N_INSNS (20),			/* cost of FSQRT instruction.  */
+
+  COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
+  COSTS_N_INSNS (4),			/* cost of ADDSS/SD SUBSS/SD insns.  */
+  COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
+  COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
+  COSTS_N_INSNS (4),			/* cost of FMA SS instruction.  */
+  COSTS_N_INSNS (4),			/* cost of FMA SD instruction.  */
+  COSTS_N_INSNS (11),			/* cost of DIVSS instruction.  */
+  COSTS_N_INSNS (14),			/* cost of DIVSD instruction.  */
+  COSTS_N_INSNS (12),			/* cost of SQRTSS instruction.  */
+  COSTS_N_INSNS (18),			/* cost of SQRTSD instruction.  */
+  1, 4, 2, 2,				/* reassoc int, fp, vec_int, vec_fp.  */
+  skylake_memcpy,
+  skylake_memset,
+  COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
+  COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
+  "16:11:8",				/* Loop alignment.  */
+  "16:11:8",				/* Jump alignment.  */
+  "0:0:8",				/* Label alignment.  */
+  "16",					/* Func alignment.  */
+};
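Both the znver1 comment earlier in this diff and the new skylake table split gather costs into a static part plus a per-element part; the intended combination, judging by the comments in this file, is static + n_elts * per_elt (the consumer lives in i386.c; gather_load_cost below is only a model of that formula):

/* Model of the "Gather load static, per_elt" pair: a gather of n_elts
   elements is costed as a fixed setup part plus a per-element part.
   Assumed formula, not the literal GCC code.  */
static int
gather_load_cost (int static_cost, int per_elt_cost, int n_elts)
{
  return static_cost + n_elts * per_elt_cost;
}

With skylake_cost's 20, 8 pair, an 8-element gather load models as 20 + 8 * 8 = 84 units, to be weighed against eight scalar loads at 6 units each plus the cost of assembling the vector.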
   /* BTVER1 has optimized REP instruction for medium sized blocks, but for
      very small blocks it is better to use loop. For large blocks, libcall can
      do nontemporary accesses and beat inline considerably.  */
@@ -1567,12 +1420,12 @@
   {12, 12},				/* cost of storing MMX registers
 					   in SImode and DImode */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
-  {10, 10, 12, 24, 48},			/* cost of loading SSE registers
+  {10, 10, 12, 48, 96},			/* cost of loading SSE registers
 					   in 32,64,128,256 and 512-bit */
-  {10, 10, 12, 24, 48},			/* cost of unaligned loads.  */
-  {10, 10, 12, 24, 48},			/* cost of storing SSE registers
+  {10, 10, 12, 48, 96},			/* cost of unaligned loads.  */
+  {10, 10, 12, 48, 96},			/* cost of storing SSE registers
 					   in 32,64,128,256 and 512-bit */
-  {10, 10, 12, 24, 48},			/* cost of unaligned stores.  */
+  {10, 10, 12, 48, 96},			/* cost of unaligned stores.  */
   14, 14,				/* SSE->integer and integer->SSE moves */
   10, 10,				/* Gather load static, per_elt.  */
   10, 10,				/* Gather store static, per_elt.  */
@@ -1603,6 +1456,10 @@
   btver1_memset,
   COSTS_N_INSNS (2),			/* cond_taken_branch_cost.  */
   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
+  "16:11:8",				/* Loop alignment.  */
+  "16:8:8",				/* Jump alignment.  */
+  "0:0:8",				/* Label alignment.  */
+  "11",					/* Func alignment.  */
 };
 
 static stringop_algs btver2_memcpy[2] = {
@@ -1654,12 +1511,12 @@
   {12, 12},				/* cost of storing MMX registers
 					   in SImode and DImode */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
-  {10, 10, 12, 24, 48},			/* cost of loading SSE registers
+  {10, 10, 12, 48, 96},			/* cost of loading SSE registers
 					   in 32,64,128,256 and 512-bit */
-  {10, 10, 12, 24, 48},			/* cost of unaligned loads.  */
-  {10, 10, 12, 24, 48},			/* cost of storing SSE registers
+  {10, 10, 12, 48, 96},			/* cost of unaligned loads.  */
+  {10, 10, 12, 48, 96},			/* cost of storing SSE registers
 					   in 32,64,128,256 and 512-bit */
-  {10, 10, 12, 24, 48},			/* cost of unaligned stores.  */
+  {10, 10, 12, 48, 96},			/* cost of unaligned stores.  */
   14, 14,				/* SSE->integer and integer->SSE moves */
   10, 10,				/* Gather load static, per_elt.  */
   10, 10,				/* Gather store static, per_elt.  */
@@ -1690,6 +1547,10 @@
   btver2_memset,
   COSTS_N_INSNS (2),			/* cond_taken_branch_cost.  */
   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
+  "16:11:8",				/* Loop alignment.  */
+  "16:8:8",				/* Jump alignment.  */
+  "0:0:8",				/* Label alignment.  */
+  "11",					/* Func alignment.  */
 };
 
 static stringop_algs pentium4_memcpy[2] = {
@@ -1776,6 +1637,10 @@
   pentium4_memset,
   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
+  NULL,					/* Loop alignment.  */
+  NULL,					/* Jump alignment.  */
+  NULL,					/* Label alignment.  */
+  NULL,					/* Func alignment.  */
 };
 
 static stringop_algs nocona_memcpy[2] = {
@@ -1865,6 +1730,10 @@
   nocona_memset,
   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
+  NULL,					/* Loop alignment.  */
+  NULL,					/* Jump alignment.  */
+  NULL,					/* Label alignment.  */
+  NULL,					/* Func alignment.  */
 };
 
 static stringop_algs atom_memcpy[2] = {
@@ -1952,6 +1821,10 @@
   atom_memset,
   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
+  "16",					/* Loop alignment.  */
+  "16:8:8",				/* Jump alignment.  */
+  "0:0:8",				/* Label alignment.  */
+  "16",					/* Func alignment.  */
 };
 
 static stringop_algs slm_memcpy[2] = {
@@ -2039,6 +1912,10 @@
   slm_memset,
   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
+  "16",					/* Loop alignment.  */
+  "16:8:8",				/* Jump alignment.  */
+  "0:0:8",				/* Label alignment.  */
+  "16",					/* Func alignment.  */
 };
 
 static stringop_algs intel_memcpy[2] = {
@@ -2126,6 +2003,10 @@
   intel_memset,
   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
+  "16",					/* Loop alignment.  */
+  "16:8:8",				/* Jump alignment.  */
+  "0:0:8",				/* Label alignment.  */
+  "16",					/* Func alignment.  */
 };
 
 /* Generic should produce code tuned for Core-i7 (and newer chips)
@@ -2154,11 +2035,11 @@
    COSTS_N_INSNS (4),			/*				 HI */
    COSTS_N_INSNS (3),			/*				 SI */
    COSTS_N_INSNS (4),			/*				 DI */
-   COSTS_N_INSNS (2)},			/*			      other */
+   COSTS_N_INSNS (4)},			/*			      other */
   0,					/* cost of multiply per each bit set */
-  {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
-   COSTS_N_INSNS (26),			/*			    HI */
-   COSTS_N_INSNS (42),			/*			    SI */
+  {COSTS_N_INSNS (16),			/* cost of a divide/mod for QI */
+   COSTS_N_INSNS (22),			/*			    HI */
+   COSTS_N_INSNS (30),			/*			    SI */
    COSTS_N_INSNS (74),			/*			    DI */
    COSTS_N_INSNS (74)},			/*			    other */
   COSTS_N_INSNS (1),			/* cost of movsx */
@@ -2168,8 +2049,8 @@
 
   /* All move costs are relative to integer->integer move times 2 and thus
      they are latency*2. */
-  4,				     /* cost for loading QImode using movzbl */
-  {4, 4, 4},				/* cost of loading integer registers
+  6,				     /* cost for loading QImode using movzbl */
+  {6, 6, 6},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
   {6, 6, 6},				/* cost of storing integer registers */
@@ -2186,13 +2067,13 @@
   2, 3, 4,				/* cost of moving XMM,YMM,ZMM register */
   {6, 6, 6, 10, 15},			/* cost of loading SSE registers
 					   in 32,64,128,256 and 512-bit */
-  {10, 10, 10, 15, 20},			/* cost of unaligned loads.  */
+  {6, 6, 6, 10, 15},			/* cost of unaligned loads.  */
   {6, 6, 6, 10, 15},			/* cost of storing SSE registers
 					   in 32,64,128,256 and 512-bit */
-  {10, 10, 10, 15, 20},			/* cost of unaligned storess.  */
-  20, 20,				/* SSE->integer and integer->SSE moves */
-  6, 6,					/* Gather load static, per_elt.  */
-  6, 6,					/* Gather store static, per_elt.  */
+  {6, 6, 6, 10, 15},			/* cost of unaligned stores.  */
+  6, 6,					/* SSE->integer and integer->SSE moves */
+  18, 6,				/* Gather load static, per_elt.  */
+  18, 6,				/* Gather store static, per_elt.  */
   32,					/* size of l1 cache.  */
   512,					/* size of l2 cache.  */
   64,					/* size of prefetch block */
@@ -2201,11 +2082,11 @@
      value is increased to perhaps more appropriate value of 5.  */
   3,					/* Branch cost */
   COSTS_N_INSNS (3),			/* cost of FADD and FSUB insns.  */
-  COSTS_N_INSNS (3),			/* cost of FMUL instruction.  */
-  COSTS_N_INSNS (20),			/* cost of FDIV instruction.  */
+  COSTS_N_INSNS (5),			/* cost of FMUL instruction.  */
+  COSTS_N_INSNS (17),			/* cost of FDIV instruction.  */
   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
-  COSTS_N_INSNS (40),			/* cost of FSQRT instruction.  */
+  COSTS_N_INSNS (14),			/* cost of FSQRT instruction.  */
 
   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
   COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
@@ -2213,15 +2094,19 @@
   COSTS_N_INSNS (5),			/* cost of MULSD instruction.  */
   COSTS_N_INSNS (5),			/* cost of FMA SS instruction.  */
   COSTS_N_INSNS (5),			/* cost of FMA SD instruction.  */
-  COSTS_N_INSNS (18),			/* cost of DIVSS instruction.  */
-  COSTS_N_INSNS (32),			/* cost of DIVSD instruction.  */
-  COSTS_N_INSNS (30),			/* cost of SQRTSS instruction.  */
-  COSTS_N_INSNS (58),			/* cost of SQRTSD instruction.  */
-  1, 2, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
+  COSTS_N_INSNS (13),			/* cost of DIVSS instruction.  */
+  COSTS_N_INSNS (17),			/* cost of DIVSD instruction.  */
+  COSTS_N_INSNS (14),			/* cost of SQRTSS instruction.  */
+  COSTS_N_INSNS (18),			/* cost of SQRTSD instruction.  */
+  1, 4, 3, 3,				/* reassoc int, fp, vec_int, vec_fp.  */
   generic_memcpy,
   generic_memset,
-  COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
-  COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
+  COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
+  COSTS_N_INSNS (2),			/* cond_not_taken_branch_cost.  */
+  "16:11:8",				/* Loop alignment.  */
+  "16:11:8",				/* Jump alignment.  */
+  "0:0:8",				/* Label alignment.  */
+  "16",					/* Func alignment.  */
 };
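The retuned generic_cost above keeps the file's stated convention that move costs are expressed relative to an integer reg-reg move scored as 2, i.e. the tables hold latency times two; the bump of the QImode movzbl load from 4 to 6 therefore models a 3-cycle load. A one-line restatement of that convention:

/* Move-cost convention used throughout this file: entries are latency * 2,
   normalized so a 1-cycle reg-reg integer move scores 2.  */
#define MOVE_COST_FROM_LATENCY(cycles) ((cycles) * 2)

static const int generic_qimode_load_cost = MOVE_COST_FROM_LATENCY (3); /* 6 */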
 
 /* core_cost should produce code tuned for Core family of CPUs.  */
@@ -2250,14 +2135,15 @@
   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
    COSTS_N_INSNS (4),			/*				 HI */
    COSTS_N_INSNS (3),			/*				 SI */
-   COSTS_N_INSNS (4),			/*				 DI */
-   COSTS_N_INSNS (4)},			/*			      other */
+   /* Here we tune for Sandybridge or newer.  */
+   COSTS_N_INSNS (3),			/*				 DI */
+   COSTS_N_INSNS (3)},			/*			      other */
   0,					/* cost of multiply per each bit set */
-  {COSTS_N_INSNS (8),			/* cost of a divide/mod for QI */
-   COSTS_N_INSNS (8),			/*			    HI */
-   /* 8-11 */
-   COSTS_N_INSNS (11),			/*			    SI */
-   /* 24-81 */
+  /* Expanding div/mod currently doesn't consider parallelism. So the cost
+     model is not realistic. We compensate by increasing the latencies a bit.  */
+  {COSTS_N_INSNS (11),			/* cost of a divide/mod for QI */
+   COSTS_N_INSNS (11),			/*			    HI */
+   COSTS_N_INSNS (14),			/*			    SI */
    COSTS_N_INSNS (81),			/*			    DI */
    COSTS_N_INSNS (81)},			/*			    other */
   COSTS_N_INSNS (1),			/* cost of movsx */
@@ -2324,5 +2210,9 @@
   core_memset,
   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
+  "16:11:8",				/* Loop alignment.  */
+  "16:11:8",				/* Jump alignment.  */
+  "0:0:8",				/* Label alignment.  */
+  "16",					/* Func alignment.  */
 };
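Presumably the new alignment strings are consumed during target option override, filling in the -falign-* defaults only when the user supplied none (which is also why NULL entries are harmless). A hypothetical sketch of that wiring; every name below is illustrative, not the actual i386.c code:

/* Hypothetical: copy a cost table's alignment strings into the -falign-*
   defaults when the user did not set them on the command line.  */
struct align_defaults
{
  const char *align_loop;    /* e.g. "16:11:8" */
  const char *align_jump;    /* e.g. "16:8:8"  */
  const char *align_label;   /* e.g. "0:0:8"   */
  const char *align_func;    /* e.g. "16"      */
};

static void
apply_align_defaults (const struct align_defaults *d,
                      const char **str_align_loops,
                      const char **str_align_jumps,
                      const char **str_align_labels,
                      const char **str_align_functions)
{
  if (!*str_align_loops && d->align_loop)
    *str_align_loops = d->align_loop;
  if (!*str_align_jumps && d->align_jump)
    *str_align_jumps = d->align_jump;
  if (!*str_align_labels && d->align_label)
    *str_align_labels = d->align_label;
  if (!*str_align_functions && d->align_func)
    *str_align_functions = d->align_func;
}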