R: Pulling data from one column to create new columns
using dplyr
and tidyr
.
First we insert a "." in the sample code, next we separate sample into 3 columns.
library(dplyr)
library(tidyr)
df %>%
mutate(sample = paste0(substring(df$sample, 1, 1), ".", substring(df$sample, 2))) %>%
separate(sample, into = c("tissue", "plant", "stage"), remove = FALSE)
sample tissue plant stage
1 P.10.1 P 10 1
2 P.11.2 P 11 2
3 S.1.1 S 1 1
4 S.3.3 S 3 3
data:
df <- structure(list(sample = c("P10.1", "P11.2", "S1.1", "S3.3")),
.Names = "sample",
class = "data.frame",
row.names = c(NA, -4L))
Creating new columns using data in one column and fill with data
In base R
, we can use table
out <-table(dt$col, dt$col)
-output
out
a b c d e f
a 1 0 0 0 0 0
b 0 1 0 0 0 0
c 0 0 1 0 0 0
d 0 0 0 1 0 0
e 0 0 0 0 1 0
f 0 0 0 0 0 1
Or use diag
`dimnames<-`(diag(nrow(dt)), list(dt$col, dt$col))
How to create new columns based on values and names of existing columns in R?
I believe this is the result you're looking for; this solution uses tidyverse functions.
library(tidyverse)
# create empty tibble to store results
df.out <- tibble()
# loop over years
for (i in unique(df$Year)) {
this.year <- df %>%
# grab number of rows for this year
filter(Year == i) %>%
# only grab the weather columns for this specific year
select(Experiment, Site, Year, contains(paste0("_", i, "_"))) %>%
# this function uses a regex to rename columns with "_" and the current year by removing the part of the name with "_" then 4 numbers
rename_with(function(x) {str_replace(x, "\\_[0-9][0-9][0-9][0-9]", "")}, contains(paste0("_", i, "_")))
# add this year to your output tibble
df.out <- df.out %>%
bind_rows(this.year)
}
R - Create new column based on substring from another column with conditions
There is probably a more efficient way to do this, but we could do a series of ifelse
statements using case_when
from tidyverse
. First, I remove any rows that just end with ;s__
. Then, in the series of statements, I check to if a given taxonomic level is present, then if so, then return that in the desired format. Then, that is repeated across all taxonomic levels.
library(tidyverse)
output <- input_data %>%
mutate(taxon = trimws(taxon, whitespace = ";s__")) %>%
mutate(taxon_main = case_when(str_detect(taxon, "s__") ~ trimws(str_replace_all(str_extract(taxon, "(?<=g__).*"), ";s_", ""), whitespace = '_'),
!str_detect(taxon, "s__") & str_detect(taxon, "g__")~ str_replace_all(str_extract(taxon, "g__.*"), "__", "_"),
!str_detect(taxon, "g__") & str_detect(taxon, "f__") ~ str_replace_all(str_extract(taxon, "f__.*"), "__", "_"),
!str_detect(taxon, "f__") & str_detect(taxon, "o__")~ str_replace_all(str_extract(taxon, "o__.*"), "__", "_"),
!str_detect(taxon, "o__") & str_detect(taxon, "c__")~ str_replace_all(str_extract(taxon, "c__.*"), "__", "_"),
!str_detect(taxon, "c__") & str_detect(taxon, "p__")~ str_replace_all(str_extract(taxon, "p__.*"), "__", "_"),
!str_detect(taxon, "p__") & str_detect(taxon, "k__")~ str_replace_all(str_extract(taxon, "k__.*"), "__", "_"),
TRUE ~ NA_character_))
Output
output %>% select(taxon_main)
taxon_main
1 Lactobacillus_crispatus
2 g_Anaerococcus
3 f_Comamonadaceae
4 f_Lachnospiraceae
5 Bosea_massiliensis
6 Acinetobacter_baumannii
7 f_Methylophilaceae
Or you could also use separate
first, which will make the code less reliant on using a lot of stringr
. We can clean up before using separate
, such as only having one underscore and remove extra s__
. Then, we can go through the ifelse
statements, and then we can bind back to the original taxon
column and drop all the other columns, except for taxon_main
.
input_data %>%
mutate(taxon = trimws(taxon, whitespace = ";s__"),
taxon = str_replace_all(taxon, ";s__", ";"),
taxon = str_replace_all(taxon, "__", "_")) %>%
separate(taxon, sep = ";", into = c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species")) %>%
mutate(taxon_main = case_when(!is.na(Species) ~ paste(str_extract(Genus, "(?<=g_).*"), Species, sep = "_"),
is.na(Species) & !is.na(Genus) ~ Genus,
is.na(Genus) & !is.na(Family) ~ Family,
is.na(Family) & !is.na(Order) ~ Order,
is.na(Order) & !is.na(Class) ~ Class,
is.na(Class) & !is.na(Phylum) ~ Phylum,
is.na(Phylum) & !is.na(Kingdom) ~ Kingdom
)) %>%
bind_cols(input_data,.) %>%
select(taxon_main, taxon)
Output
taxon_main taxon
1 Lactobacillus_crispatus k__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Lactobacillaceae;g__Lactobacillus;s__crispatus
2 g_Anaerococcus k__Bacteria;p__Firmicutes;c__Tissierellia;o__Tissierellales;f__Peptoniphilaceae;g__Anaerococcus;s__
3 f_Comamonadaceae k__Bacteria;p__Proteobacteria;c__Betap__Proteobacteria;o__Burkholderiales;f__Comamonadaceae
4 f_Lachnospiraceae k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Lachnospiraceae
5 Bosea_massiliensis k__Bacteria;p__Proteobacteria;c__Alphap__Proteobacteria;o__Rhizobiales;f__Bradyrhizobiaceae;g__Bosea;s__massiliensis
6 Acinetobacter_baumannii k__Bacteria;p__Proteobacteria;c__Gammap__Proteobacteria;o__Pseudomonadales;f__Moraxellaceae;g__Acinetobacter;s__baumannii
7 f_Methylophilaceae k__Bacteria;p__Proteobacteria;c__Betap__Proteobacteria;o__Nitrosomonadales;f__Methylophilaceae
R Creating new columns based on factors in another column
Try this
df %>% group_by(category) %>% mutate(id = row_number()) %>% ungroup() %>% pivot_wider(id_cols = id, names_from = category, values_from = cond) %>% select(-id)
Taking variable names out of column and creating new columns in R
This can be accomplished a few ways. Might be a good opportunity to get to know the tidyverse:
library(tidyverse)
new.df <- spread(old.df, response, response)
This is an unusual use of tidyr::spread()
. In this case, it constructs new column names from the values in "response", and also fills those columns with the values in "response". The fill
argument can be used to change what goes in the resulting blank cells.
How to get R to create new column (named from left part of string in old column), and then put right part of string from old column into new column
Starting with
quux <- structure(list(oldColumn1 = c("COLOR: RED", "COLOR: RED", "COLOR: BLUE", "COLOR: GREEN", "COLOR: BLUE")), class = "data.frame", row.names = c(NA, -5L))
The naive approach would be
data.frame(COLOR = trimws(sub("COLOR:", "", quux$oldColumn1)))
# COLOR
# 1 RED
# 2 RED
# 3 BLUE
# 4 GREEN
# 5 BLUE
But I'm assuming you have a more generic need. Let's assume that you have some more things to parse out of that, such as
quux <- structure(list(oldColumn1 = c("COLOR: RED", "COLOR: RED", "COLOR: BLUE", "COLOR: GREEN", "COLOR: BLUE", "SIZE: 1", "SIZE: 3", "SIZE: 5")), class = "data.frame", row.names = c(NA, -8L))
quux
# oldColumn1
# 1 COLOR: RED
# 2 COLOR: RED
# 3 COLOR: BLUE
# 4 COLOR: GREEN
# 5 COLOR: BLUE
# 6 SIZE: 1
# 7 SIZE: 3
# 8 SIZE: 5
then we can generalize it with
tmp <- strcapture("(.*)\\s*:\\s*(.*)", quux$oldColumn1, list(k="", v=""))
tmp$ign <- ave(rep(1L, nrow(tmp)), tmp$k, FUN = seq_along)
reshape2::dcast(tmp, ign ~ k, value.var = "v")[,-1,drop=FALSE]
# COLOR SIZE
# 1 RED 1
# 2 RED 3
# 3 BLUE 5
# 4 GREEN <NA>
# 5 BLUE <NA>
--
Edit: alternative with updated data:
do.call(cbind, lapply(dat, function(X) {
nm <- sub(":.*", "", X[1])
out <- data.frame(trimws(sub(".*:", "", X)))
names(out) <- nm
out
}))
# COLOR SIZE DESIGNSTYLE
# 1 RED LARGE STYLED
# 2 RED MEDIUM ORIGINAL MAKER
# 3 BLUE XLARGE COUTURE
# 4 GREEN MEDIUM COUTURE
# 5 BLUE SMALL STYLED
create new column based on existing pattern R
We may use regex_left_join
library(data.table)
library(fuzzyjoin)
regex_left_join(tableRules, data.table(DIMENSION = listDimPoss),
by = c("object_name" = "DIMENSION"))
object_name DIMENSION
1 instr_asset_row instr_asset
2 functional_cat functional_cat
3 ref_sector_second ref_sector
Related Topics
Remove 'A' from Legend When Using Aesthetics and Geom_Text
Insert Rows For Missing Dates/Times
How to Subset Matrix to One Column, Maintain Matrix Data Type, Maintain Row/Column Names
How to Convert Variable With Mixed Date Formats to One Format
Filter Multiple Values on a String Column in Dplyr
Force the Origin to Start At 0
Why Can't R'S Ifelse Statements Return Vectors
Error: C Stack Usage Is Too Close to the Limit
Create a Sequential Number (Counter) For Rows Within Each Group of a Dataframe
Why Does Data.Table Update Names(Dt) by Reference, Even If I Assign to Another Variable
Using Reshape from Wide to Long in R
General Suggestions For Debugging in R
Filter Rows Which Contain a Certain String
Explicitly Calling Return in a Function or Not
Filtering a Data Frame by Values in a Column