How to create a lag variable within each group?
You could do this within data.table
library(data.table)
# Within each group, drop the last element (`value[-.N]`) and prepend NA, so
# every row receives the value of the preceding row of its own group.
# The result follows the current row order; nothing is sorted here.
data[, lag.value:=c(NA, value[-.N]), by=groups]
data
# time groups value lag.value
#1: 1 a 0.02779005 NA
#2: 2 a 0.88029938 0.02779005
#3: 3 a -1.69514201 0.88029938
#4: 1 b -1.27560288 NA
#5: 2 b -0.65976434 -1.27560288
#6: 3 b -1.37804943 -0.65976434
#7: 4 b 0.12041778 -1.37804943
For multiple columns:
# Select every column whose name starts with "value" and build the matching
# "lag.<name>" output names.
nm1 <- grep("^value", colnames(data), value=TRUE)
nm2 <- paste("lag", nm1, sep=".")
# Apply the same drop-last / prepend-NA lag to each selected column, per group.
data[, (nm2):=lapply(.SD, function(x) c(NA, x[-.N])), by=groups, .SDcols=nm1]
data
# time groups value value1 value2 lag.value lag.value1
#1: 1 b -0.6264538 0.7383247 1.12493092 NA NA
#2: 2 b 0.1836433 0.5757814 -0.04493361 -0.6264538 0.7383247
#3: 3 b -0.8356286 -0.3053884 -0.01619026 0.1836433 0.5757814
#4: 1 a 1.5952808 1.5117812 0.94383621 NA NA
#5: 2 a 0.3295078 0.3898432 0.82122120 1.5952808 1.5117812
#6: 3 a -0.8204684 -0.6212406 0.59390132 0.3295078 0.3898432
#7: 4 a 0.4874291 -2.2146999 0.91897737 -0.8204684 -0.6212406
# lag.value2
#1: NA
#2: 1.12493092
#3: -0.04493361
#4: NA
#5: 0.94383621
#6: 0.82122120
#7: 0.59390132
Update
From data.table versions >= v1.9.5, we can use `shift` with `type` as `"lag"` or `"lead"`. By default, the type is `"lag"`.
# shift() lags by default; applied to .SD it returns one shifted vector per
# column listed in .SDcols, which are assigned to the names in nm2.
data[, (nm2) := shift(.SD), by=groups, .SDcols=nm1]
# time groups value value1 value2 lag.value lag.value1
#1: 1 b -0.6264538 0.7383247 1.12493092 NA NA
#2: 2 b 0.1836433 0.5757814 -0.04493361 -0.6264538 0.7383247
#3: 3 b -0.8356286 -0.3053884 -0.01619026 0.1836433 0.5757814
#4: 1 a 1.5952808 1.5117812 0.94383621 NA NA
#5: 2 a 0.3295078 0.3898432 0.82122120 1.5952808 1.5117812
#6: 3 a -0.8204684 -0.6212406 0.59390132 0.3295078 0.3898432
#7: 4 a 0.4874291 -2.2146999 0.91897737 -0.8204684 -0.6212406
# lag.value2
#1: NA
#2: 1.12493092
#3: -0.04493361
#4: NA
#5: 0.94383621
#6: 0.82122120
#7: 0.59390132
If you need the reverse, use `type = "lead"`:
nm3 <- paste("lead", nm1, sep=".")
Using the original dataset
data[, (nm3) := shift(.SD, type='lead'), by = groups, .SDcols=nm1]
# time groups value value1 value2 lead.value lead.value1
#1: 1 b -0.6264538 0.7383247 1.12493092 0.1836433 0.5757814
#2: 2 b 0.1836433 0.5757814 -0.04493361 -0.8356286 -0.3053884
#3: 3 b -0.8356286 -0.3053884 -0.01619026 NA NA
#4: 1 a 1.5952808 1.5117812 0.94383621 0.3295078 0.3898432
#5: 2 a 0.3295078 0.3898432 0.82122120 -0.8204684 -0.6212406
#6: 3 a -0.8204684 -0.6212406 0.59390132 0.4874291 -2.2146999
#7: 4 a 0.4874291 -2.2146999 0.91897737 NA NA
# lead.value2
#1: -0.04493361
#2: -0.01619026
#3: NA
#4: 0.82122120
#5: 0.59390132
#6: 0.91897737
#7: NA
data
# Fix the RNG seed so the rnorm() draws below are reproducible.
set.seed(1)
# Seven rows: three in group "b" followed by four in group "a"; `time`
# restarts at 1 within each group.
data <- data.table(time =c(1:3,1:4),groups = c(rep(c("b","a"),c(3,4))),
value = rnorm(7), value1=rnorm(7), value2=rnorm(7))
R: Group-by lag variable generating different within-group lag values
We can create a subset of the data frame containing the unique combinations of `qy` and `id`, create the lag columns `value_t1` and `value_t2`, and then merge the result back into the original data frame.
library(dplyr)
library(data.table)
library(lubridate)
# Create example data frame
set.seed(123)
v2 <- sample(1:100, 15)
df <- data.frame(qy = c(rep('2016-01-01', 5), rep('2016-04-01', 5), rep('2016-10-01', 5)),
id = c(rep(c('a','a','b','b','c'), 3)),
value_t = c(0,0,1,1,0,1,1,0,0,0,0,0,1,1,1),
value2_t = c(v2))
df$qy <- ymd(df$qy)
df <- df %>% arrange(id, qy)
# Process the data
# Attach the lagged values to every original row.
# The right-hand table holds one row per (id, qy) with the one- and two-period
# lags of value_t computed inside each id; joining on the key copies those lags
# onto all rows of df that share the same id and qy.
df2 <- left_join(
  df,
  df %>%
    distinct(id, qy, .keep_all = TRUE) %>%
    group_by(id) %>%
    mutate(
      value_t1 = lag(value_t, n = 1L),
      value_t2 = lag(value_t, n = 2L)
    ) %>%
    ungroup() %>%
    select(-value_t, -value2_t),
  by = c("qy", "id")
)
df2
# qy id value_t value2_t value_t1 value_t2
# 1 2016-01-01 a 0 29 NA NA
# 2 2016-01-01 a 0 79 NA NA
# 3 2016-04-01 a 1 5 0 NA
# 4 2016-04-01 a 1 50 0 NA
# 5 2016-10-01 a 0 87 1 0
# 6 2016-10-01 a 0 98 1 0
# 7 2016-01-01 b 1 41 NA NA
# 8 2016-01-01 b 1 86 NA NA
# 9 2016-04-01 b 0 83 1 NA
# 10 2016-04-01 b 0 51 1 NA
# 11 2016-10-01 b 1 60 0 1
# 12 2016-10-01 b 1 94 0 1
# 13 2016-01-01 c 0 91 NA NA
# 14 2016-04-01 c 0 42 0 NA
# 15 2016-10-01 c 1 9 0 0
Create lag variable with conditions and group by id
Is there a way to return for a fiscal year the last monthly stock price at previous fiscal year?
# One row per (id, fiscal year); this table receives the looked-up price.
out = unique(dt[, .(id, fyear, fyear_start, fyear_end)])
# For each row of `out`, look up `dt` on id and on `fyear_start - 1L`.
# roll=TRUE falls back to the latest earlier prc_month_end when there is no
# exact match; x.prc returns the `prc` column of `dt` for the matched rows.
out[, prc_end := {
dt[.(id = .SD$id, prc_month_end = .SD$fyear_start - 1L), on=.(id, prc_month_end), roll=TRUE, x.prc]
}]
id fyear fyear_start fyear_end prc_end
1: 59328 2001 2001-01-01 2001-12-31 NA
2: 59328 2002 2002-01-01 2002-12-31 31.45
3: 59328 2003 2003-01-01 2003-12-31 15.57
4: 61241 2001 2000-07-01 2001-06-30 NA
5: 61241 2002 2001-07-01 2002-06-30 15.86
6: 61241 2003 2002-07-01 2003-06-30 6.46
This is a rolling update join. For the rows of table `out`:
- Construct the lookup vectors `.(id, fyear_start - 1)` using `.SD = out`, the subset of data.
- Look up rows of `dt`, "rolling" the last vector, `fyear_start - 1`, to the nearest earlier date.
- Take the matched values of `x.prc`, the `prc` column from `dt`.
The notation `x.*` comes from the `x[i]` join/lookup syntax. For more details, see `?data.table`.
lag multiple variables grouped by columns
Try `shift` on the grouped columns, then `concat` the result with the original frame:
# Lag distances to generate; each distance k adds one "lag<k>"-prefixed copy
# of the selected columns.
lag = [1, 2]
# Select the columns with a list (double brackets). Indexing a GroupBy with a
# bare tuple of names, as in ['var1','var2','var3'], is rejected by pandas 2.x.
df = pd.concat(
    [df]
    + [
        df.groupby('grp')[['var1', 'var2', 'var3']].shift(x).add_prefix('lag' + str(x))
        for x in lag
    ],
    axis=1,
)
R - Computations with lag variable by group
There are a couple of problems in your code. First, create your `data.frame` like this:
data.df <- data.frame(origin, dest, year, type, a, b)
This will retain the class of all the vectors. Note that if you don't want `origin` and `dest` to be factors, just use the argument `stringsAsFactors = FALSE` in the `data.frame()` function.
Next, create your new variable as follows:
data.df2 <- data.df %>%
group_by(origin, dest, type) %>%
# Sort by year so that lag(a) picks up the previous period's value.
arrange(year) %>%
# Change in `a` versus the preceding row of the same group, scaled by `b`;
# the first row of each group has no predecessor and yields NA.
mutate(new_var = (a - lag(a)) * b) %>%
ungroup()
Here, `new_var` is the variable that you want. You are right that `dplyr` doesn't know that the lagged value is from the previous time period. Therefore, you have to use `arrange(year)`.
How to get the lagged values of a variable based on groups with pandas?
Use `Series.shift` to get the previous value, mask it where it equals the current value, and then fill the resulting missing values forward and backward:
# Previous row's value; the first row has no predecessor and becomes NaN.
s = toy_df['var'].shift()
# Blank out positions where the value did not change, carry the last differing
# value forward, then back-fill the leading rows that are still missing.
toy_df['new'] = s.mask(toy_df['var'].eq(s)).ffill().bfill()
print (toy_df)
var new
0 1 1.0
1 1 1.0
2 1 1.0
3 2 1.0
4 2 1.0
5 3 2.0
6 1 3.0
7 1 3.0
8 2 1.0
9 4 2.0
10 4 2.0
11 4 2.0
If you want to convert the values to integers:
# Previous row's value; the first row has no predecessor and becomes NaN.
s = toy_df['var'].shift()
# Same mask / forward-fill / back-fill as above, then cast to int once every
# missing value has been filled.
toy_df['new'] = s.mask(toy_df['var'].eq(s)).ffill().bfill().astype(int)
print (toy_df)
var new
0 1 1
1 1 1
2 1 1
3 2 1
4 2 1
5 3 2
6 1 3
7 1 3
8 2 1
9 4 2
10 4 2
11 4 2
R Faster way to create lag by entire group using data.table
Here is another option:
# For every row, compute the index of the first row of its run of equal `val`s:
# non-first rows of a run are set to NA and then filled with "locf". Reading
# the shifted column at that index gives the whole run the value preceding it.
dt[, val_lag := shift(val)[nafill(replace(seq.int(.N), rowid(rleid(val)) > 1L, NA_integer_), "locf")]]
timing code:
library(data.table)
# Fix the RNG seed so both sample() calls are reproducible.
set.seed(0L)
# nr rows; IDs are drawn with replacement from 1..ng.
nr <- 1e6
ng <- 1e5
dt = data.table(ID=sample(ng, nr, TRUE), val=as.character(sample(nr, nr, TRUE)))
# Order by ID, then val, so equal values are adjacent within each ID.
setorder(dt, ID, val)
# opt: index-based approach evaluated over the whole table (no `by`).
# rle: run-length approach evaluated separately for each ID.
microbenchmark::microbenchmark(times = 3L,
opt = dt[, val_lag := shift(val)[nafill(replace(seq.int(.N), rowid(rleid(val)) > 1L, NA_integer_), "locf")]],
rle = dt[, val_lag := with(rle(val), rep(c(NA, head(values, -1)), lengths)), by = ID]
)
timings:
Unit: milliseconds
expr min lq mean median uq max neval
opt 133.8857 159.8922 265.2029 185.8987 330.8614 475.8242 3
rle 3097.6005 3123.5422 3193.2654 3149.4839 3241.0978 3332.7117 3
edit: added an example of what is happening:
index | 1 2 3 4 5 6 7 8 9 10
value | a a a b b c c c d d
shifted (s) | NA a a a b b c c c d
rowid+rleid | 1 2 3 1 2 1 2 3 1 2
replace | 1 NA NA 4 NA 6 NA NA 9 NA <In ?nafill, Only double and integer data types are currently supported. Hence, nafill the indices before accessing>
nafill | 1 1 1 4 4 6 6 6 9 9
using s above | s[1] s[1] s[1] s[4] s[4] s[6] s[6] s[6] s[9] s[9]
Related Topics
Find All Combinations of a Set of Numbers That Add Up to a Certain Total
R: Error in Usemethod("Tbl_Vars")
How to Dplyr Rename a Column, by Column Index
Delete Rows Containing Specific Strings in R
Creating Grouped Bar-Plot of Multi-Column Data in R
How to Convert a Data Frame Column to Numeric Type
Break Dataframe into Smaller Dataframe'S and Save Them
How to Replace Negative Values in a Dataframe Column With a Different Value
Conditionally Remove Rows from a Database Using R
Adding Some Space Between the X-Axis and the Bars, in Ggplot
Convert Categorical Variables to Numeric in R
How to Select Variables in an R Dataframe Whose Names Contain a Particular String
How to Add a Diagonal Line to a Plot
Gsub a Every Element After a Keyword in R
Change the Class from Factor to Numeric of Many Columns in a Data Frame
Convert Dataframe Column to 1 or 0 for "True"/"False" Values and Assign to Dataframe