Contents

[SQL] RFM Analysis with SQL

What is RFM?

RFM is a technique that can perform Customer Segmentation to determine customer’s buying behavior. Company uses the RFM metric as a customer behavior segmentation indicator to improve marketing strategies for revenue increases by reactivating customers to more royal.

  • R(Recency) : The last time the customer made a purchase. The smaller the number, the better.

  • F(Frequency) : Number of transactions. The bigger the number, the better

  • M(Monetary) : The spending power of a customer. The bigger the number, the better.


Data points used in RFM Analysis

  • Recency - last order date
  • Frequency - count of total orders
  • Monetary value - total spend

Understand Data (SQL)

  • Check the whole data
1
2
SELECT * 
FROM sales_data
  • Inspecting Data
1
2
3
4
5
6
7
8
9
SELECT DISTINCT status FROM sales_data -- NICE TO PLOT
SELECT DISTINCT year_id FROM sales_data
SELECT DISTINCT PRODUCTLINE FROM sales_data -- NICE TO PLOT 
SELECT DISTINCT COUNTRY FROM sales_data -- NICE TO PLOT
SELECT DISTINCT DEALSIZE FROM sales_data -- NICE TO PLOT
SELECT DISTINCT TERRITORY FROM sales_data -- NICE TO PLOT 

SELECT DISTINCT MONTH_ID FROM sales_data
WHERE year_id = 2003

Analysis

1. Salse

  • By Product
1
2
3
4
SELECT PRODUCTLINE, SUM(sales) AS revenue
FROM sales_data
GROUP BY PRODUCTLINE
ORDER BY 2 DESC; 
  • By year
1
2
3
4
SELECT YEAR_ID, SUM(sales) AS revenue
FROM sales_data
GROUP BY YEAR_ID
ORDER BY 2 DESC; 
  • By Dealsize
1
2
3
4
SELECT DEALSIZE, SUM(sales) AS revenue
FROM sales_data
GROUP BY DEALSIZE
ORDER BY 2 DESC; 

2. Best Month for sales per month

1
2
3
4
5
6
7
8
SELECT MONTH_ID, 
      SUM(sales) AS Revenue, 
      COUNT(ORDERNUMBER) AS Frequency
FROM sales_data
WHERE YEAR_ID = 2003 -- CAN CHANGE YEAR TO 2004
GROUP BY MONTH_ID 
ORDER BY Revenue DESC;

1
2
3
4
5
6
7
8
SELECT MONTH_ID, 
      SUM(sales) AS Revenue, 
      COUNT(ORDERNUMBER) AS Frequency
FROM sales_data
WHERE YEAR_ID = 2003 AND MONTH_ID = 11 -- CAN CHANGE YEAR TO 2004
GROUP BY MONTH_ID, PRODUCTLINE 
ORDER BY Revenue DESC;

3. WHO IS OUR BEST CUSTOMER (this could be answered with RFM)

1
2
3
4
5
6
7
8
9
SELECT 
  CUSTOMERNAME, 
  SUM(sales) AS MonetaryValue, 
  AVG(sales) AS AvgMonetaryValue, 
  COUNT(ORDERNUMBER) AS Frequency, 
  MAX(ORDERDATE) AS last_order_date, 
  DATEDIFF(DD, MAX(ORDERDATE), (SELECT MAX(ORDERDATE) FROM sales_data AS max_order_date)) AS Recency
FROM sales_data
GROUP BY CUSTOMERNAME

4. Categorize the RFM

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
WITH rfm AS (
  SELECT 
    CUSTOMERNAME, 
    SUM(sales) AS MonetaryValue, 
    AVG(sales) AS AvgMonetaryValue, 
    COUNT(ORDERNUMBER) AS Frequency, 
    MAX(ORDERDATE) AS last_order_date, 
    DATEDIFF(DD, MAX(ORDERDATE), (SELECT MAX(ORDERDATE) FROM sales_data AS max_order_date)) AS Recency
  FROM sales_data
  GROUP BY CUSTOMERNAME
)

rfm_calc AS (
  SELECT R.*, 
        NTILE(4) OVER (ORDER BY Recency) rfm_recency, 
        NTILE(4) OVER (ORDER BY Frequency) rfm_frequency,
        NTILE(4) OVER (ORDER BY AvgMonetaryValue) rfm_monetary,  
  FROM rfm R
  ORDER BY 4 DESC
)

SELECT 
  C.*, 
  CAST(rfm_recency AS VARCHAR) + CAST(rfm_frequency AS VARCHAR) + CAST(rfm_monetary AS VARCHAR) AS rfm_cell_string
FROM rfm_calc C

;

5. Create TEMP Table

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
DROP TABLE IF EXISTS #rfm --for global table ##rfm

WITH rfm AS (
  SELECT 
    CUSTOMERNAME, 
    SUM(sales) AS MonetaryValue, 
    AVG(sales) AS AvgMonetaryValue, 
    COUNT(ORDERNUMBER) AS Frequency, 
    MAX(ORDERDATE) AS last_order_date, 
    DATEDIFF(DD, MAX(ORDERDATE), (SELECT MAX(ORDERDATE) FROM sales_data AS max_order_date)) AS Recency
  FROM sales_data
  GROUP BY CUSTOMERNAME
),

  rfm_calc AS (
    SELECT R.*, 
          NTILE(4) OVER (ORDER BY Recency) rfm_recency, 
          NTILE(4) OVER (ORDER BY Frequency) rfm_frequency,
          NTILE(4) OVER (ORDER BY MonetaryValue) rfm_monetary,  
    FROM rfm R
    ORDER BY 4 DESC
  )

SELECT 
  C.*, 
  CAST(rfm_recency AS VARCHAR) + CAST(rfm_frequency AS VARCHAR) + CAST(rfm_monetary AS VARCHAR) AS rfm_cell_string
into #rfm
FROM rfm_calc C

;
1
2
3
4
5
6
7
8
9
SELECT CUSTOMERNAME, rfm_recency, rfm_freqeuncy, rfm_monetary, 
  CASE
   WHEN rfm_cell_string IN (111, 112, 121, 122, 123, 132, 211, 212, 114, 141) THEN 'lost_customers'
   WHEN rfm_cell_string IN (133, 134, 143, 244, 334, 343, 344) THEN 'slipping away, cannot lose'
   WHEN rfm_cell_string IN (311, 411, 331) THEN 'new_customers' 
   WHEN rfm_cell_string IN (222, 223, 233, 322) THEN 'potential churners'
   WHEN rfm_cell_string IN (323, 333, 321, 422, 332, 432) THEN 'active'
   WHEN rfm_cell_string IN (433, 434, 443, 444) THEN 'loyal'
FROM #rfm

6. What products codes are sold together?

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
SELECT DISTINCT ORDERNUMBER, STUFF(

  (SELECT ',' + PRODUCTCODE
  FROM sales_data P
  WHERE ORDERNUMBER IN 
    (
      SELECT ORDERNUMBER
      FROM (
        SELECT ORDERNUMBER, COUNT(*) AS rn, 
        FROM sales_data
        WHERE STATUS='Shipped'
        GROUP BY ORDERNUMBER
      ) m 
      where rn = 2 -- CAN CHANGE THE NUMBER
    )
    AND p.ORDERNUMBER = s.ORDERNUMBER
    for xml path ('')), 
    1, 1, '')
FROM sales_data S
ORDER BY 2 DESC